Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 275781)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 275782)
@@ -1,3707 +1,3707 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
 #include <zfs_comutil.h>
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include <libzfs.h>
 
 /*
  * Table-lookup helpers used when printing.  The two *_NAME macros below
  * bounds-check idx against the size of the table they index and yield
  * "UNKNOWN" rather than reading past the end of that table.
  */
 #define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
 	zio_compress_table[(idx)].ci_name : "UNKNOWN")
 #define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
 	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
 /*
  * Object type name: taken from dmu_ot[] when idx is below DMU_OT_NUMTYPES,
  * otherwise from dmu_ot_byteswap[] when DMU_OT_IS_VALID(idx) holds,
  * otherwise "UNKNOWN".
  */
 #define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
 	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
 	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
 /*
  * Map an object type to a value in [0, DMU_OT_NUMTYPES]: types below
  * DMU_OT_NUMTYPES map to themselves, the two DMU_OTN_ZAP_* types map to
  * DMU_OT_ZAP_OTHER, and everything else maps to DMU_OT_NUMTYPES.
  */
 #define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
 	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
 	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
 
 /*
  * Variables that normally live in another translation unit and are only
  * declared here.  When building under lint they are defined here instead.
  */
 #ifndef lint
 extern boolean_t zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
 extern int zfs_vdev_async_read_max_active;
 #else
 boolean_t zfs_recover;
 uint64_t zfs_arc_max, zfs_arc_meta_limit;
 int zfs_vdev_async_read_max_active;
 #endif
 
 /* Program name; used as the prefix of usage() and fatal() messages. */
 const char cmdname[] = "zdb";
 /*
  * Per-option state, indexed by the option character (e.g. dump_opt['P']).
  * Code in this file compares entries against small integers, matching the
  * usage text "specify an option more than once ... to make it verbose".
  */
 uint8_t dump_opt[256];
 
 /* Signature shared by the dump_*() object viewers in this file. */
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 extern void dump_intent_log(zilog_t *);
 /*
  * Numeric trailing arguments and their count.  dump_metaslabs() interprets
  * them as a vdev id followed by metaslab numbers when -d is not given.
  */
 uint64_t *zopt_object = NULL;
 int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
 /* NOTE(review): presumably the limit set by -I; confirm against main(). */
 uint64_t max_inflight = 1000;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 /*
  * Returns the string to be used as the $UMEM_DEBUG setting.  Declared with
  * an explicit (void) parameter list so that it is a real prototype, matching
  * _umem_logging_init().
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 /*
  * Returns the string to be used as the $UMEM_LOGGING setting.
  */
 const char *
 _umem_logging_init(void)
 {
 	static const char umem_logging[] = "fail,contents";
 
 	/* $UMEM_LOGGING setting */
 	return (umem_logging);
 }
 
 /*
  * Print the command synopsis and option summary to stderr, then exit with
  * status 1.  This function does not return.
  */
 static void
 usage(void)
 {
 	(void) fprintf(stderr,
 	    "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
 	    "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
 	    "       %s [-divPA] [-e -p path...] [-U config] dataset "
 	    "[object...]\n"
 	    "       %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
 	    "poolname [vdev [metaslab...]]\n"
 	    "       %s -R [-A] [-e [-p path...]] poolname "
 	    "vdev:offset:size[:flags]\n"
 	    "       %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
 	    "       %s -l [-uA] device\n"
 	    "       %s -C [-A] [-U config]\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr, "    If object numbers are specified, only "
 	    "those objects are dumped\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -u uberblock\n");
 	(void) fprintf(stderr, "        -d dataset(s)\n");
 	(void) fprintf(stderr, "        -i intent logs\n");
 	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "        -b block statistics\n");
 	(void) fprintf(stderr, "        -m metaslabs\n");
 	(void) fprintf(stderr, "        -M metaslab groups\n");
 	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -D dedup statistics\n");
 	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -l dump label contents\n");
 	(void) fprintf(stderr, "        -L disable leak tracking (do not "
 	    "load spacemaps)\n");
 	(void) fprintf(stderr, "        -R read and display block from a "
 	    "device\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
 	    "panic recovery (-AA) or both (-AAA)\n");
 	(void) fprintf(stderr, "        -F attempt automatic rewind within "
 	    "safe range of transaction groups\n");
 	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
 	    "cachefile\n");
 	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
 	    "work with dataset)\n");
 	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
 	    "has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -p <path> -- use one or more with "
 	    "-e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -x <dumpdir> -- "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
 	    "searching for uberblocks\n");
 	/*
 	 * The default quoted here must match the initializer of the
 	 * file-scope max_inflight variable (1000).  It is spelled out as a
 	 * literal because the variable may already have been overwritten by
 	 * the time usage() runs.
 	 */
 	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
 	    "specify the maximum number of "
 	    "checksumming I/Os [default is 1000]\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }
 
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  */
 
 /*
  * Write "<cmdname>: <formatted message>\n" to stderr and exit with status 1.
  * fmt and the variadic arguments follow printf conventions.
  */
 static void
 fatal(const char *fmt, ...)
 {
 	va_list args;
 
 	(void) fprintf(stderr, "%s: ", cmdname);
 
 	va_start(args, fmt);
 	(void) vfprintf(stderr, fmt, args);
 	va_end(args);
 
 	(void) fputc('\n', stderr);
 
 	exit(1);
 }
 
 /*
  * Object viewer for a packed nvlist.  The first uint64_t at `data` gives the
  * number of bytes to read from the start of `object`; those bytes are
  * unpacked into an nvlist, which is printed with indent 8 and then freed.
  * `size` is unused.
  */
 /* ARGSUSED */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	size_t nvsize;
 	char *packed;
 	nvlist_t *nv;
 
 	nvsize = *(uint64_t *)data;
 	packed = umem_alloc(nvsize, UMEM_NOFAIL);
 
 	/* Any read or unpack failure is fatal. */
 	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 
 	/* The packed copy is no longer needed once unpacked. */
 	umem_free(packed, nvsize);
 
 	dump_nvlist(nv, 8);
 	nvlist_free(nv);
 }
 
 /*
  * Object viewer that prints the fields of the spa_history_phys_t pointed to
  * by `data`.  Prints nothing when `data` is NULL.  `os`, `object` and `size`
  * are unused.
  */
 /* ARGSUSED */
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const spa_history_phys_t *hist = data;
 
 	if (hist == NULL)
 		return;
 
 	/* One line per field, in the same order as before. */
 	(void) printf("\t\tpool_create_len = %llu\n"
 	    "\t\tphys_max_off = %llu\n"
 	    "\t\tbof = %llu\n"
 	    "\t\teof = %llu\n"
 	    "\t\trecords_lost = %llu\n",
 	    (u_longlong_t)hist->sh_pool_create_len,
 	    (u_longlong_t)hist->sh_phys_max_off,
 	    (u_longlong_t)hist->sh_bof,
 	    (u_longlong_t)hist->sh_eof,
 	    (u_longlong_t)hist->sh_records_lost);
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf)
 {
 	if (dump_opt['P'])
 		(void) sprintf(buf, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf);
 }
 
 /*
  * Bar used by dump_histogram(): a row is drawn by printing a suffix of this
  * string, so the longest possible bar is histo_width characters.
  */
 const char histo_stars[] = "****************************************";
 const int histo_width = sizeof (histo_stars) - 1;
 
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int i;
 	int minidx = size - 1;
 	int maxidx = 0;
 	uint64_t max = 0;
 
 	for (i = 0; i < size; i++) {
 		if (histo[i] > max)
 			max = histo[i];
 		if (histo[i] > 0 && i > maxidx)
 			maxidx = i;
 		if (histo[i] > 0 && i < minidx)
 			minidx = i;
 	}
 
 	if (max < histo_width)
 		max = histo_width;
 
 	for (i = minidx; i <= maxidx; i++) {
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    i + offset, (u_longlong_t)histo[i],
 		    &histo_stars[(max - histo[i]) * histo_width / max]);
 	}
 }
 
 /*
  * Print summary statistics for the ZAP object `object` in `os`.  Nothing is
  * printed if zap_get_stats() fails.  A ZAP whose pointer-table length is zero
  * is reported on a single "microzap" line; otherwise the fat-ZAP fields and
  * five histograms are printed.
  */
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	zap_stats_t zs;
 
 	if (zap_get_stats(os, object, &zs) != 0)
 		return;
 
 	if (zs.zs_ptrtbl_len == 0) {
 		ASSERT(zs.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)zs.zs_blocksize,
 		    (u_longlong_t)zs.zs_num_entries);
 		return;
 	}
 
 	/* Pointer-table fields. */
 	(void) printf("\tFat ZAP stats:\n"
 	    "\t\tPointer table:\n"
 	    "\t\t\t%llu elements\n"
 	    "\t\t\tzt_blk: %llu\n"
 	    "\t\t\tzt_numblks: %llu\n"
 	    "\t\t\tzt_shift: %llu\n"
 	    "\t\t\tzt_blks_copied: %llu\n"
 	    "\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_len,
 	    (u_longlong_t)zs.zs_ptrtbl_zt_blk,
 	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks,
 	    (u_longlong_t)zs.zs_ptrtbl_zt_shift,
 	    (u_longlong_t)zs.zs_ptrtbl_blks_copied,
 	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
 
 	/* Whole-object counters and header fields. */
 	(void) printf("\t\tZAP entries: %llu\n"
 	    "\t\tLeaf blocks: %llu\n"
 	    "\t\tTotal blocks: %llu\n"
 	    "\t\tzap_block_type: 0x%llx\n"
 	    "\t\tzap_magic: 0x%llx\n"
 	    "\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)zs.zs_num_entries,
 	    (u_longlong_t)zs.zs_num_leafs,
 	    (u_longlong_t)zs.zs_num_blocks,
 	    (u_longlong_t)zs.zs_block_type,
 	    (u_longlong_t)zs.zs_magic,
 	    (u_longlong_t)zs.zs_salt);
 
 	/* Histograms, each preceded by its heading. */
 	const struct {
 		const char *heading;
 		const uint64_t *histo;
 	} histograms[] = {
 		{ "\t\tLeafs with 2^n pointers:\n",
 		    zs.zs_leafs_with_2n_pointers },
 		{ "\t\tBlocks with n*5 entries:\n",
 		    zs.zs_blocks_with_n5_entries },
 		{ "\t\tBlocks n/10 full:\n",
 		    zs.zs_blocks_n_tenths_full },
 		{ "\t\tEntries with n chunks:\n",
 		    zs.zs_entries_using_n_chunks },
 		{ "\t\tBuckets with n entries:\n",
 		    zs.zs_buckets_with_n_entries },
 	};
 
 	for (size_t h = 0;
 	    h < sizeof (histograms) / sizeof (histograms[0]); h++) {
 		(void) printf("%s", histograms[h].heading);
 		dump_histogram(histograms[h].histo, ZAP_HISTOGRAM_SIZE, 0);
 	}
 }
 
 /*
  * Viewer with the object_viewer_t signature that deliberately prints
  * nothing; all arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Viewer with the object_viewer_t signature that prints a fixed
  * "unknown object type" line; all arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) fputs("\tUNKNOWN OBJECT TYPE\n", stdout);
 }
 
 /*
  * Viewer with the object_viewer_t signature that prints nothing; all
  * arguments are ignored.  Unlike its siblings it has external linkage.
  */
 /*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Viewer with the object_viewer_t signature that prints nothing; all
  * arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer for a generic ZAP: prints the ZAP statistics followed by one
  * "name = value" line per entry.  Entries with 1-byte integers are printed
  * as a string; 2-, 4- and 8-byte integers are printed as space-separated
  * numbers.  `data` and `size` are unused.
  */
 /*ARGSUSED*/
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	void *prop;
 	int i;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		size_t propsize;
 
 		(void) printf("\t\t%s = ", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 		propsize = attr.za_num_integers * attr.za_integer_length;
 		prop = umem_zalloc(propsize, UMEM_NOFAIL);
 		/*
 		 * Best effort: if the lookup fails the zero-filled buffer is
 		 * printed as-is.
 		 */
 		(void) zap_lookup(os, object, attr.za_name,
 		    attr.za_integer_length, attr.za_num_integers, prop);
 		if (attr.za_integer_length == 1) {
 			/*
 			 * The buffer holds exactly za_num_integers bytes and
 			 * the stored value is not guaranteed to contain a
 			 * NUL, so bound the print to the buffer length.
 			 */
 			(void) printf("%.*s", (int)attr.za_num_integers,
 			    (char *)prop);
 		} else {
 			for (i = 0; i < attr.za_num_integers; i++) {
 				switch (attr.za_integer_length) {
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
 					break;
 				case 4:
 					(void) printf("%u ",
 					    ((uint32_t *)prop)[i]);
 					break;
 				case 8:
 					/* %lld is signed; cast to match. */
 					(void) printf("%lld ",
 					    (longlong_t)((int64_t *)prop)[i]);
 					break;
 				}
 			}
 		}
 		(void) printf("\n");
 		umem_free(prop, propsize);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Object viewer that prints only the ZAP statistics of `object`; the
  * entries themselves are not printed here.  `data` and `size` are unused.
  */
 /*ARGSUSED*/
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dump_zap_stats(os, object);
 	/* contents are printed elsewhere, properly decoded */
 }
 
 /*
  * Object viewer for a ZAP whose first integer is decoded with the
  * ATTR_LENGTH/ATTR_BSWAP/ATTR_NUM macros.  Prints the ZAP statistics, then
  * one line per entry: the raw value in hex followed by the three decoded
  * fields.  Entries holding no integers get an empty value.  `data` and
  * `size` are unused.
  */
 /*ARGSUSED*/
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t ent;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &ent) == 0) {
 		(void) printf("\t\t%s = ", ent.za_name);
 		if (ent.za_num_integers != 0) {
 			(void) printf(" %llx : [%d:%d:%d]\n",
 			    (u_longlong_t)ent.za_first_integer,
 			    (int)ATTR_LENGTH(ent.za_first_integer),
 			    (int)ATTR_BSWAP(ent.za_first_integer),
 			    (int)ATTR_NUM(ent.za_first_integer));
 		} else {
 			(void) printf("\n");
 		}
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Object viewer for a ZAP whose values are arrays of 16-bit integers.  Prints
  * the ZAP statistics, then "name = [ a  b  ... ]" for every entry.  An entry
  * whose integer length is not 2, or whose lookup fails, trips a VERIFY.
  * `data` and `size` are unused.
  */
 /*ARGSUSED*/
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 	uint16_t *layout_attrs;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&cursor, os, object);
 	    zap_cursor_retrieve(&cursor, &attr) == 0;
 	    zap_cursor_advance(&cursor)) {
 		size_t nbytes;
 		int idx;
 
 		(void) printf("\t\t%s = [", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 
 		VERIFY(attr.za_integer_length == 2);
 
 		/* Same size is used for the allocation and the free. */
 		nbytes = attr.za_num_integers * attr.za_integer_length;
 		layout_attrs = umem_zalloc(nbytes, UMEM_NOFAIL);
 
 		VERIFY(zap_lookup(os, object, attr.za_name,
 		    attr.za_integer_length,
 		    attr.za_num_integers, layout_attrs) == 0);
 
 		for (idx = 0; idx != attr.za_num_integers; idx++)
 			(void) printf(" %d ", (int)layout_attrs[idx]);
 		(void) printf("]\n");
 
 		umem_free(layout_attrs, nbytes);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Object viewer for a directory ZAP.  Prints the ZAP statistics, then for
  * each entry its name, the object number extracted with ZFS_DIRENT_OBJ()
  * and the name of the file type extracted with ZFS_DIRENT_TYPE().
  * `data` and `size` are unused.
  */
 /*ARGSUSED*/
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	/*
 	 * Indexed by ZFS_DIRENT_TYPE().
 	 * NOTE(review): assumes that macro yields 0..15; confirm in
 	 * zfs_znode.h.
 	 */
 	static const char *const typenames[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		/*
 		 * The object number is an unsigned 64-bit quantity; print it
 		 * with a matching conversion and an explicit cast so the
 		 * argument type agrees with the format on every ABI.
 		 */
 		(void) printf("\t\t%s = %llu (type: %s)\n",
 		    attr.za_name,
 		    (u_longlong_t)ZFS_DIRENT_OBJ(attr.za_first_integer),
 		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the leaf vdevs whose DTL space map
  * exists and whose dbuf size equals sizeof (space_map_phys_t).
  */
 int
 get_dtl_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		/* Interior vdev: sum the counts of all children. */
 		for (int child = 0; child < vd->vdev_children; child++)
 			total += get_dtl_refcount(vd->vdev_child[child]);
 		return (total);
 	}
 
 	/* Leaf vdev: contributes either one or zero. */
 	space_map_t *dtl = vd->vdev_dtl_sm;
 
 	return ((dtl != NULL &&
 	    dtl->sm_dbuf->db_size == sizeof (space_map_phys_t)) ? 1 : 0);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the metaslab space maps that exist
  * and whose dbuf size equals sizeof (space_map_phys_t).  Metaslabs are only
  * examined on top-level vdevs that are not being removed.
  */
 int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	if (vd->vdev_top == vd && !vd->vdev_removing) {
 		for (int idx = 0; idx < vd->vdev_ms_count; idx++) {
 			space_map_t *map = vd->vdev_ms[idx]->ms_sm;
 
 			if (map == NULL)
 				continue;
 			if (map->sm_dbuf->db_size == sizeof (space_map_phys_t))
 				total++;
 		}
 	}
 
 	/* Children are visited regardless of the check above. */
 	for (int child = 0; child < vd->vdev_children; child++)
 		total += get_metaslab_refcount(vd->vdev_child[child]);
 
 	return (total);
 }
 
 /*
  * Compare the refcount recorded for SPA_FEATURE_SPACEMAP_HISTOGRAM with the
  * number of qualifying DTL and metaslab space maps found by walking the vdev
  * tree.  Returns 0 when they agree; otherwise prints both values and
  * returns 2.
  */
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	uint64_t expected = 0;
 	uint64_t actual;
 
 	/* On failure `expected` simply keeps its initial value of 0. */
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
 	    &expected);
 
 	actual = get_dtl_refcount(spa->spa_root_vdev);
 	actual += get_metaslab_refcount(spa->spa_root_vdev);
 
 	if (expected == actual)
 		return (0);
 
 	(void) printf("space map refcount mismatch: expected %lld != "
 	    "actual %lld\n",
 	    (longlong_t)expected,
 	    (longlong_t)actual);
 	return (2);
 }
 
 /*
  * Print every 64-bit entry of the space map `sm`, read from its object in
  * `os`.  Debug entries are shown as action/txg/pass; all other entries are
  * shown as an allocated ('A') or freed ('F') range.  While walking, the net
  * allocated space is accumulated and a warning is printed if it differs
  * from space_map_allocated(sm).  Does nothing when sm is NULL.
  */
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	uint64_t alloc, offset, entry;
 	/* Names for the values produced by SM_DEBUG_ACTION_DECODE(). */
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 			    "INVALID", "INVALID", "INVALID", "INVALID" };
 
 	if (sm == NULL)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	alloc = 0;
 	for (offset = 0; offset < space_map_length(sm);
 	    offset += sizeof (entry)) {
 		uint8_t mapshift = sm->sm_shift;
 
 		/* A failed read of any entry is fatal. */
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 		if (SM_DEBUG_DECODE(entry)) {
 
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
 			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
 			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
 		} else {
 			/*
 			 * Decoded offset and run length are scaled by
 			 * mapshift; the range is made absolute by adding
 			 * sm_start.
 			 */
 			(void) printf("\t    [%6llu]    %c  range:"
 			    " %010llx-%010llx  size: %06llx\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
 			    mapshift) + sm->sm_start),
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
 			    mapshift) + sm->sm_start +
 			    (SM_RUN_DECODE(entry) << mapshift)),
 			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
 			/* Allocations add to the running total, frees subtract. */
 			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
 				alloc += SM_RUN_DECODE(entry) << mapshift;
 			else
 				alloc -= SM_RUN_DECODE(entry) << mapshift;
 		}
 	}
 	/* First value printed is the recorded total, second is the computed one. */
 	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
 		    "with space map summary (%llu)\n",
 		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
 	}
 }
 
 /*
  * Print one summary line for a metaslab (segment count, maximum block size
  * and percentage free) followed by the histogram of its range tree.
  */
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	range_tree_t *tree = msp->ms_tree;
 	avl_tree_t *by_size = &msp->ms_size_tree;
 	char maxsize[32];
 	int pct_free;
 
 	pct_free = range_tree_space(tree) * 100 / msp->ms_size;
 	zdb_nicenum(metaslab_block_maxsize(msp), maxsize);
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", avl_numnodes(by_size), "maxsize", maxsize,
 	    "freepct", pct_free);
 
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(tree->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print one metaslab: always a summary line, then progressively more detail
  * depending on how many times -m (and -d) were given.
  */
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	char freebuf[32];
 
 	/*
 	 * NOTE(review): sm is passed to space_map_allocated() and
 	 * space_map_object() here before the sm != NULL test further down;
 	 * confirm both helpers tolerate a NULL space map.
 	 */
 	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
 	/* -mmm without -L: load the metaslab and print in-memory statistics. */
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
 		metaslab_load_wait(msp);
 		if (!msp->ms_loaded) {
 			VERIFY0(metaslab_load(msp));
 			range_tree_stat_verify(msp->ms_tree);
 		}
 		dump_metaslab_stats(msp);
 		/* Unloaded unconditionally, even if it was loaded on entry. */
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	/* -mm: on-disk histogram, only when the feature is active. */
 	if (dump_opt['m'] > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * The space map histogram represents free space in chunks
 		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	/* -dddddd or -mmmm: dump every space map entry. */
 	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
 		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 
 		mutex_enter(&msp->ms_lock);
 		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 		mutex_exit(&msp->ms_lock);
 	}
 }
 
 /*
  * Print the per-vdev heading used above a list of metaslabs: the vdev id,
  * the metaslab count with column titles, and a row of dashes.
  */
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	u_longlong_t vdev_id = (u_longlong_t)vd->vdev_id;
 	u_longlong_t ms_count = (u_longlong_t)vd->vdev_ms_count;
 
 	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
 	    vdev_id, "metaslabs", ms_count, "offset", "spacemap", "free");
 
 	(void) printf("\t%15s   %19s   %15s   %10s\n",
 	    "---------------",
 	    "-------------------",
 	    "---------------",
 	    "-------------");
 }
 
 /*
  * For every top-level vdev whose metaslab group belongs to the pool's normal
  * class, print the vdev id, metaslab count, fragmentation and group
  * histogram; then print the same summary for the class as a whole.  A
  * fragmentation of ZFS_FRAG_INVALID is shown as "-".  As a side effect each
  * visited group's mg_fragmentation is recomputed and stored.
  */
 static void
 dump_metaslab_groups(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	uint64_t pool_frag;
 
 	metaslab_class_histogram_verify(mc);
 
 	for (int idx = 0; idx < root->vdev_children; idx++) {
 		vdev_t *top = root->vdev_child[idx];
 		metaslab_group_t *mg = top->vdev_mg;
 
 		/* Only groups in the normal class are reported. */
 		if (mg->mg_class != mc)
 			continue;
 
 		metaslab_group_histogram_verify(mg);
 		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 		    "fragmentation",
 		    (u_longlong_t)top->vdev_id,
 		    (u_longlong_t)top->vdev_ms_count);
 
 		if (mg->mg_fragmentation != ZFS_FRAG_INVALID) {
 			(void) printf("%3llu%%\n",
 			    (u_longlong_t)mg->mg_fragmentation);
 		} else {
 			(void) printf("%3s\n", "-");
 		}
 
 		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 	}
 
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 
 	pool_frag = metaslab_class_fragmentation(mc);
 	if (pool_frag != ZFS_FRAG_INVALID)
 		(void) printf("\t%3llu%%\n", (u_longlong_t)pool_frag);
 	else
 		(void) printf("\t%3s\n", "-");
 
 	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print metaslabs for the pool.  With no numeric arguments (or when -d is
  * set) every metaslab of every top-level vdev is printed.  Otherwise the
  * first numeric argument selects a top-level vdev and any further arguments
  * select individual metaslabs on that vdev.
  */
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *vd, *rvd = spa->spa_root_vdev;
 	/* [c, children) is the range of top-level vdevs the final loop visits. */
 	uint64_t m, c = 0, children = rvd->vdev_children;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_objects > 0) {
 		c = zopt_object[0];
 
 		/* fatal() exits, so c is a valid child index below. */
 		if (c >= children)
 			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
 
 		if (zopt_objects > 1) {
 			vd = rvd->vdev_child[c];
 			print_vdev_metaslab_header(vd);
 
 			/* Remaining arguments are metaslab numbers on this vdev. */
 			for (m = 1; m < zopt_objects; m++) {
 				if (zopt_object[m] < vd->vdev_ms_count)
 					dump_metaslab(
 					    vd->vdev_ms[zopt_object[m]]);
 				else
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)zopt_object[m]);
 			}
 			(void) printf("\n");
 			return;
 		}
 		/* Only a vdev id was given: narrow the loop below to that vdev. */
 		children = c + 1;
 	}
 	for (; c < children; c++) {
 		vd = rvd->vdev_child[c];
 		print_vdev_metaslab_header(vd);
 
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print one line for each physical variant of the dedup-table entry `dde`
  * whose birth txg is non-zero: the walk index, the reference count, the
  * variant name and the block pointer reconstructed from the entry.
  */
 static void
 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
 {
 	char *types[4] = { "ditto", "single", "double", "triple" };
 	const ddt_key_t *key = &dde->dde_key;
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++) {
 		const ddt_phys_t *phys = &dde->dde_phys[p];
 
 		/* Skip variants that were never written. */
 		if (phys->ddp_phys_birth == 0)
 			continue;
 
 		ddt_bp_create(ddt->ddt_checksum, key, phys, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 
 		(void) printf("index %llx refcnt %llu %s %s\n",
 		    (u_longlong_t)index, (u_longlong_t)phys->ddp_refcnt,
 		    types[p], blkbuf);
 	}
 }
 
 /*
  * Print the dedup, compression and copies ratios derived from `dds`, plus
  * their combination dedup * compress / copies.  Prints nothing when
  * dds_blocks is zero.
  */
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	if (dds->dds_blocks == 0)
 		return;
 
 	const double ref_lsize = (double)dds->dds_ref_lsize;
 	const double ref_psize = (double)dds->dds_ref_psize;
 	const double ref_dsize = (double)dds->dds_ref_dsize;
 	const double dsize = (double)dds->dds_dsize;
 
 	const double dedup = ref_dsize / dsize;
 	const double compress = ref_lsize / ref_psize;
 	const double copies = ref_dsize / ref_psize;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 /*
  * Print one dedup table object, identified by (type, class) within `ddt`.
  * Nothing is printed if the object does not exist or holds no entries.
  * Output grows with the number of -D flags: the summary line always, the
  * histogram at -DDD, and the individual entries at -DDDD (the unique class
  * only at -DDDDD).
  */
 static void
 dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 {
 	char objname[DDT_NAMELEN];
 	dmu_object_info_t info;
 	uint64_t entries, disk_bytes, core_bytes;
 	uint64_t cursor = 0;
 	ddt_entry_t dde;
 	int error;
 
 	error = ddt_object_info(ddt, type, class, &info);
 	if (error == ENOENT)
 		return;
 	ASSERT(error == 0);
 
 	error = ddt_object_count(ddt, type, class, &entries);
 	ASSERT(error == 0);
 	if (entries == 0)
 		return;
 
 	disk_bytes = info.doi_physical_blocks_512 << 9;
 	core_bytes = info.doi_fill_count * info.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, objname);
 
 	/* Sizes are reported as per-entry averages. */
 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
 	    objname,
 	    (u_longlong_t)entries,
 	    (u_longlong_t)(disk_bytes / entries),
 	    (u_longlong_t)(core_bytes / entries));
 
 	if (dump_opt['D'] < 3)
 		return;
 
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	if (dump_opt['D'] < 4 ||
 	    (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE))
 		return;
 
 	(void) printf("%s contents:\n\n", objname);
 
 	for (;;) {
 		error = ddt_object_walk(ddt, type, class, &cursor, &dde);
 		if (error != 0)
 			break;
 		dump_dde(ddt, &dde, cursor);
 	}
 
 	/* The walk is expected to end by running out of entries. */
 	ASSERT(error == ENOENT);
 
 	(void) printf("\n");
 }
 
 /*
  * Dump every dedup table of the pool (one per checksum, type and class),
  * then the pool-wide totals: the aggregated histogram when -D is given more
  * than once, and the dedup ratios.  If the totals show no blocks, a single
  * "All DDTs are empty" line replaces the totals.
  */
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_histogram_t ddh_total = { 0 };
 	ddt_stat_t dds_total = { 0 };
 
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		ddt_t *table = spa->spa_ddt[cksum];
 
 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++)
 				dump_ddt(table, type, class);
 		}
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
 	if (dds_total.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &ddh_total);
 		zpool_dump_ddt(&dds_total, &ddh_total);
 	}
 
 	dump_dedup_ratio(&dds_total);
 }
 
 /*
  * Range-tree walk callback: print one segment as a half-open interval
  * [start, start + size) with its length.  `arg` is the string printed at
  * the start of the line.
  */
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	const char *label = arg;
 	uint64_t end = start + size;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    label, (u_longlong_t)start, (u_longlong_t)end,
 	    (u_longlong_t)(size));
 }
 
 /*
  * Print the dirty time logs of vd and, recursively, of all its children.
  * `indent` is the number of spaces to indent this vdev's lines; the top
  * level call passes 0, which also causes the section heading to be printed.
  */
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t required;
 	/* Labels for the DTL_TYPES range trees, indexed by DTL type. */
 	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
 	char prefix[256];
 
 	/* vdev_dtl_required() is called between state enter and exit. */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	/*
 	 * Label preference: the vdev path, else the vdev type for a vdev that
 	 * has a parent, else the pool name.
 	 */
 	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    vd->vdev_path ? vd->vdev_path :
 	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_t *rt = vd->vdev_dtl[t];
 		/* Empty logs are not printed. */
 		if (range_tree_space(rt) == 0)
 			continue;
 		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 		    indent + 2, "", name[t]);
 		mutex_enter(rt->rt_lock);
 		range_tree_walk(rt, dump_dtl_seg, prefix);
 		mutex_exit(rt->rt_lock);
 		/* At -dddddd also dump the DTL space map of childless vdevs. */
 		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
 			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
 /*
  * from spa_history.c: spa_history_create_obj()
  *
  * dump_history() starts with a buffer of HIS_BUF_LEN_DEF bytes (128 KiB)
  * and doubles it as needed, giving up once the size would reach
  * HIS_BUF_LEN_MAX (1 GiB).
  */
 #define	HIS_BUF_LEN_DEF	(128 << 10)
 #define	HIS_BUF_LEN_MAX	(1 << 30)
 
 /*
  * Read the pool history of `spa`, unpack it into event records and print
  * each one.  A record with a command string is printed as
  * "<timestamp> <command>".  A legacy internal event is printed with its
  * event name, txg and description.  With -hh every record is also dumped as
  * an nvlist, and records that could not be printed in either form are
  * labelled "unrecognized record".
  *
  * On allocation or read failure a message is written to stderr and the
  * function returns without printing any history.
  */
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf = NULL;
 	uint64_t bufsize = HIS_BUF_LEN_DEF;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	time_t tsec;
 	struct tm t;
 	char tbuf[30];
 	char internalstr[MAXPATHLEN];
 
 	if ((buf = malloc(bufsize)) == NULL) {
 		(void) fprintf(stderr, "Unable to read history: "
 		    "out of memory\n");
 		/* Without a buffer there is nothing to read into. */
 		return;
 	}
 	do {
 		len = bufsize;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			free(buf);
 			return;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 		/* Re-read any trailing partial record on the next pass. */
 		off -= resid;
 
 		/*
 		 * If the history block is too big, double the buffer
 		 * size and try again.
 		 */
 		if (resid == len) {
 			free(buf);
 			buf = NULL;
 
 			bufsize <<= 1;
 			if ((bufsize >= HIS_BUF_LEN_MAX) ||
 			    ((buf = malloc(bufsize)) == NULL)) {
 				(void) fprintf(stderr, "Unable to read history: "
 				    "out of memory\n");
 				return;
 			}
 		}
 	} while (len != 0);
 	free(buf);
 
 	(void) printf("\nHistory:\n");
 	for (uint_t i = 0; i < num; i++) {
 		uint64_t time, txg, ievent;
 		char *cmd, *intstr;
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
 		    &time) != 0)
 			goto next;
 		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
 		    &cmd) != 0) {
 			/* No command string: try the internal-event form. */
 			if (nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
 				goto next;
 			verify(nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG, &txg) == 0);
 			verify(nvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR, &intstr) == 0);
 			/* Guard the name-table lookup below. */
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			(void) snprintf(internalstr,
 			    sizeof (internalstr),
 			    "[internal %s txg:%lld] %s",
 			    zfs_history_event_names[ievent],
 			    (longlong_t)txg, intstr);
 			cmd = internalstr;
 		}
 		tsec = time;
 		(void) localtime_r(&tsec, &t);
 		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		(void) printf("%s %s\n", tbuf, cmd);
 		printed = B_TRUE;
 
 next:
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 }
 
 /*
  * Viewer with the object_viewer_t signature that prints nothing; all
  * arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Convert the block id in bookmark `zb` to an offset.
  *
  * With a dnode, the block id is scaled by the span of one block at level
  * zb_level (derived from dn_indblkshift) and by the dnode's data block size,
  * which is stored in units of 2^SPA_MINBLOCKSHIFT bytes.
  *
  * Without a dnode (dnp == NULL) the bookmark must have a negative level;
  * the result is the block id itself when zb_object is 0, otherwise the
  * block id multiplied by the logical size of bp.
  */
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	if (dnp != NULL) {
 		ASSERT(zb->zb_level >= 0);
 
 		return ((zb->zb_blkid <<
 		    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
 		    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 	}
 
 	ASSERT(zb->zb_level < 0);
 
 	if (zb->zb_object != 0)
 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
 	return (zb->zb_blkid);
 }
 
 /*
  * Format a short, single-line description of block pointer `bp` into
  * `blkbuf`, which is `buflen` bytes long.  No write exceeds buflen.
  *
  *  - With -bbbbbb the full snprintf_blkptr() form is used instead.
  *  - An embedded block pointer is shown as its embedded type, logical and
  *    physical sizes and birth txg.
  *  - Otherwise the DVAs are listed as vdev:offset:asize (only the first one
  *    unless -dddddd), followed by the birth txg for a hole, or by the
  *    sizes, fill count and logical/physical birth txgs for a real block.
  */
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Bounded like every other write in this function. */
 		(void) snprintf(blkbuf, buflen,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
 		    (u_longlong_t)bp->blk_birth);
 		return;
 	}
 
 	/* Each piece is appended at the current end of the string. */
 	blkbuf[0] = '\0';
 	for (int i = 0; i < ndvas; i++)
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "B=%llu",
 		    (u_longlong_t)bp->blk_birth);
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)bp->blk_birth,
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
 	}
 }
 
 /*
  * Print one line for a block in an object's block tree: the offset
  * computed by blkid2offset(), a column of blanks with "L<level>" placed in
  * the slot matching zb_level (slots run from dn_nlevels - 1 down to -1),
  * and the compact block pointer text.
  */
 static void
 print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char bpstr[BP_SPRINTF_LEN];
 	int lvl;
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
 	lvl = dnp->dn_nlevels - 1;
 	while (lvl >= -1) {
 		if (lvl != zb->zb_level)
 			(void) printf(" ");
 		else
 			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 		lvl--;
 	}
 
 	snprintf_blkptr_compact(bpstr, sizeof (bpstr), bp);
 	(void) printf("%s\n", bpstr);
 }
 
 /*
  * Print bp and, when it is a non-hole block above level 0, read it
  * through the ARC and recurse into every block pointer it contains.
  * Block pointers with a zero birth txg are skipped entirely.
  *
  * Returns 0 on success, or the first error reported by arc_read() or by a
  * recursive visit; that error ends the walk of the current block.
  */
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	uint32_t aflags = ARC_WAIT;
 	arc_buf_t *buf;
 	blkptr_t *child;
 	uint64_t fill = 0;
 	int nptrs, slot;
 	int err;
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	print_indirect(bp, zb, dnp);
 
 	/* Only indirect blocks that are not holes have children to visit. */
 	if (BP_GET_LEVEL(bp) <= 0 || BP_IS_HOLE(bp))
 		return (0);
 
 	/* Number of block pointers held in this indirect block. */
 	nptrs = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 	err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 	if (err)
 		return (err);
 	ASSERT(buf->b_data);
 
 	child = buf->b_data;
 	for (slot = 0; slot < nptrs; slot++, child++) {
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 		    zb->zb_level - 1,
 		    zb->zb_blkid * nptrs + slot);
 		err = visit_indirect(spa, dnp, child, &czb);
 		if (err)
 			break;
 		fill += BP_GET_FILL(child);
 	}
 
 	/* The children's fill counts must add up to the parent's. */
 	if (!err)
 		ASSERT3U(fill, ==, BP_GET_FILL(bp));
 	(void) arc_buf_remove_ref(buf, &buf);
 
 	return (err);
 }
 
 /*
  * Print the "Indirect blocks:" section for a dnode by walking, with
  * visit_indirect(), the tree below each of the dnode's dn_nblkptr
  * top-level block pointers.  Errors from the walk are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_indirect(dnode_t *dn)
 {
 	zbookmark_phys_t top;
 	dnode_phys_t *phys = dn->dn_phys;
 	int slot = 0;
 
 	(void) printf("Indirect blocks:\n");
 
 	/* Top-level pointers sit one level below the dnode's level count. */
 	SET_BOOKMARK(&top, dmu_objset_id(dn->dn_objset),
 	    dn->dn_object, phys->dn_nlevels - 1, 0);
 
 	while (slot < phys->dn_nblkptr) {
 		top.zb_blkid = slot;
 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), phys,
 		    &phys->dn_blkptr[slot], &top);
 		slot++;
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Object viewer for a DSL directory: print the fields of the
  * dsl_dir_phys_t passed in data, one per line.  Byte counts go through
  * zdb_nicenum().  Nothing is printed when data is NULL.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dir_phys_t *ddphys = data;
 	char numstr[32];
 	time_t created;
 
 	if (ddphys == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 	created = ddphys->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 
 	(void) printf("\t\thead_dataset_obj = %llu\n",
 	    (u_longlong_t)ddphys->dd_head_dataset_obj);
 	(void) printf("\t\tparent_dir_obj = %llu\n",
 	    (u_longlong_t)ddphys->dd_parent_obj);
 	(void) printf("\t\torigin_obj = %llu\n",
 	    (u_longlong_t)ddphys->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)ddphys->dd_child_dir_zapobj);
 
 	zdb_nicenum(ddphys->dd_used_bytes, numstr);
 	(void) printf("\t\tused_bytes = %s\n", numstr);
 
 	zdb_nicenum(ddphys->dd_compressed_bytes, numstr);
 	(void) printf("\t\tcompressed_bytes = %s\n", numstr);
 
 	zdb_nicenum(ddphys->dd_uncompressed_bytes, numstr);
 	(void) printf("\t\tuncompressed_bytes = %s\n", numstr);
 
 	zdb_nicenum(ddphys->dd_quota, numstr);
 	(void) printf("\t\tquota = %s\n", numstr);
 
 	zdb_nicenum(ddphys->dd_reserved, numstr);
 	(void) printf("\t\treserved = %s\n", numstr);
 
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)ddphys->dd_props_zapobj);
 	(void) printf("\t\tdeleg_zapobj = %llu\n",
 	    (u_longlong_t)ddphys->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ddphys->dd_flags);
 
 	/* One line per entry of the used-space breakdown array. */
 #define	PRINT_BREAKDOWN(which) \
 	zdb_nicenum(ddphys->dd_used_breakdown[DD_USED_ ## which], numstr); \
 	(void) printf("\t\tused_breakdown[" #which "] = %s\n", numstr)
 	PRINT_BREAKDOWN(HEAD);
 	PRINT_BREAKDOWN(SNAP);
 	PRINT_BREAKDOWN(CHILD);
 	PRINT_BREAKDOWN(CHILD_RSRV);
 	PRINT_BREAKDOWN(REFRSRV);
 #undef PRINT_BREAKDOWN
 }
 
 /*
  * Object viewer for a DSL dataset: print the fields of the
  * dsl_dataset_phys_t passed in data, one per line, ending with the
  * dataset's block pointer.  Nothing is printed when data is NULL.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dataset_phys_t *ds = data;
 	char refd[32], comp[32], uncomp[32], uniq[32];
 	char bpstr[BP_SPRINTF_LEN];
 	time_t created;
 
 	if (ds == NULL)
 		return;
 
 	ASSERT(size == sizeof (*ds));
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_obj);
 	(void) printf("\t\tprev_snap_txg = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_txg);
 	(void) printf("\t\tnext_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_snap_obj);
 	(void) printf("\t\tsnapnames_zapobj = %llu\n",
 	    (u_longlong_t)ds->ds_snapnames_zapobj);
 	(void) printf("\t\tnum_children = %llu\n",
 	    (u_longlong_t)ds->ds_num_children);
 	(void) printf("\t\tuserrefs_obj = %llu\n",
 	    (u_longlong_t)ds->ds_userrefs_obj);
 
 	created = ds->ds_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	(void) printf("\t\tcreation_txg = %llu\n",
 	    (u_longlong_t)ds->ds_creation_txg);
 	(void) printf("\t\tdeadlist_obj = %llu\n",
 	    (u_longlong_t)ds->ds_deadlist_obj);
 
 	/* Space accounting, rendered in human-readable units. */
 	zdb_nicenum(ds->ds_referenced_bytes, refd);
 	zdb_nicenum(ds->ds_compressed_bytes, comp);
 	zdb_nicenum(ds->ds_uncompressed_bytes, uncomp);
 	zdb_nicenum(ds->ds_unique_bytes, uniq);
 	(void) printf("\t\tused_bytes = %s\n", refd);
 	(void) printf("\t\tcompressed_bytes = %s\n", comp);
 	(void) printf("\t\tuncompressed_bytes = %s\n", uncomp);
 	(void) printf("\t\tunique = %s\n", uniq);
 
 	(void) printf("\t\tfsid_guid = %llu\n",
 	    (u_longlong_t)ds->ds_fsid_guid);
 	(void) printf("\t\tguid = %llu\n",
 	    (u_longlong_t)ds->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ds->ds_flags);
 	(void) printf("\t\tnext_clones_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_clones_obj);
 	(void) printf("\t\tprops_obj = %llu\n",
 	    (u_longlong_t)ds->ds_props_obj);
 
 	snprintf_blkptr(bpstr, sizeof (bpstr), &ds->ds_bp);
 	(void) printf("\t\tbp = %s\n", bpstr);
 }
 
 /*
  * Callback handed to bptree_iterate() by dump_bptree(): print one block
  * pointer per line, skipping those with a zero birth txg.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char bpstr[BP_SPRINTF_LEN];
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	snprintf_blkptr(bpstr, sizeof (bpstr), bp);
 	(void) printf("\t%s\n", bpstr);
 	return (0);
 }
 
 /*
  * Summarize the bptree stored in object obj of os under the label name:
  * the number of datasets (bt_end - bt_begin) and the byte count from its
  * bonus buffer.  Printed only when dump_opt['d'] >= 3; at 5 or above the
  * individual block pointers are listed as well.
  */
 static void
 dump_bptree(objset_t *os, uint64_t obj, char *name)
 {
 	char nice[32];
 	bptree_phys_t *bt;
 	dmu_buf_t *db;
 
 	if (dump_opt['d'] >= 3) {
 		VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 		bt = db->db_data;
 		zdb_nicenum(bt->bt_bytes, nice);
 		(void) printf("\n    %s: %llu datasets, %s\n",
 		    name, (unsigned long long)(bt->bt_end - bt->bt_begin),
 		    nice);
 		dmu_buf_rele(db, FTAG);
 
 		if (dump_opt['d'] >= 5) {
 			(void) printf("\n");
 
 			(void) bptree_iterate(os, obj, B_FALSE,
 			    dump_bptree_cb, NULL, NULL);
 		}
 	}
 }
 
 /*
  * Callback handed to bpobj_iterate_nofree() by dump_bpobj(): print the
  * compact form of one block pointer on its own line.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char line[BP_SPRINTF_LEN];
 
 	ASSERT(bp->blk_birth != 0);
 
 	snprintf_blkptr_compact(line, sizeof (line), bp);
 	(void) printf("\t%s\n", line);
 
 	return (0);
 }
 
 /*
  * Print a one-line summary of a bpobj labelled with name; indent * 8 is
  * the minimum field width of the label.  A bpobj with sub-objects also
  * reports its compressed/uncompressed sizes and is followed by a
  * recursive summary of each sub-object, one indent level deeper.
  *
  * Nothing is printed unless dump_opt['d'] >= 3.  At dump_opt['d'] >= 5 a
  * top-level (indent == 0) bpobj is additionally walked with
  * bpobj_iterate_nofree(), printing each block pointer.
  */
 static void
 dump_bpobj(bpobj_t *bpo, char *name, int indent)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
 
 	if (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) {
 		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 		    bytes);
 	} else {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
 		(void) printf("    %*s: object %llu, %llu local blkptrs, "
 		    "%llu subobjs, %s (%s/%s comp)\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 		    bytes, comp, uncomp);
 
 		/* The sub-object ids are stored as an array of uint64_t. */
 		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
 			uint64_t subobj;
 			bpobj_t child;
 			int error;
 
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
 
 			error = bpobj_open(&child, bpo->bpo_os, subobj);
 			if (error == 0) {
 				dump_bpobj(&child, "subobj", indent + 1);
 				bpobj_close(&child);
 			} else {
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)subobj);
 			}
 		}
 	}
 
 	if (dump_opt['d'] >= 5 && indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print a dataset's deadlist.
  *
  * Nothing is printed unless dump_opt['d'] >= 3.  An old-format deadlist is
  * handed straight to dump_bpobj().  Otherwise the used/compressed/
  * uncompressed totals are printed, and at dump_opt['d'] >= 4 every entry
  * of the deadlist tree follows: as a "mintxg -> obj" line, or at
  * dump_opt['d'] >= 5 as a full dump_bpobj() listing labelled with the
  * entry's mintxg.
  */
 static void
 dump_deadlist(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
 	uint64_t unused;
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes);
 	zdb_nicenum(dl->dl_phys->dl_comp, comp);
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
 	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
 	    bytes, comp, uncomp);
 
 	if (dump_opt['d'] < 4)
 		return;
 
 	(void) printf("\n");
 
 	/* force the tree to be loaded */
 	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
 
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		if (dump_opt['d'] >= 5) {
 			char buf[128];
 
 			/*
 			 * The label carries only the mintxg: dump_bpobj()
 			 * prints the object number itself, so exactly one
 			 * argument is passed for the one conversion.
 			 */
 			(void) snprintf(buf, sizeof (buf), "mintxg %llu -> ",
 			    (u_longlong_t)dle->dle_mintxg);
 
 			dump_bpobj(&dle->dle_bpobj, buf, 0);
 		} else {
 			(void) printf("mintxg %llu -> obj %llu\n",
 			    (u_longlong_t)dle->dle_mintxg,
 			    (u_longlong_t)dle->dle_bpobj.bpo_object);
 		}
 	}
 }
 
 /*
  * Per-dataset lookup state shared by the znode viewer.
  *
  * idx_tree and domain_tree are the FUID trees created and loaded by
  * dump_uidgid() and torn down by fuid_table_destroy(); fuid_table_loaded
  * records whether they are currently populated.  sa_attr_table is filled
  * in by sa_setup() in dump_znode(), and sa_loaded records that this has
  * been done; dump_one_dir() clears it after each dataset.
  */
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
 static boolean_t sa_loaded;
 sa_attr_type_t *sa_attr_table;
 
 /*
  * Tear down the FUID index/domain trees if dump_uidgid() loaded them, and
  * mark them as unloaded.  Does nothing when no table is loaded, so it is
  * safe to call unconditionally.
  */
 static void
 fuid_table_destroy(void)
 {
 	if (!fuid_table_loaded)
 		return;
 
 	zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 	fuid_table_loaded = B_FALSE;
 }
 
 /*
  * Print one id line, labelled with id_type ("uid" or "gid").
  * An id whose FUID index is zero is a plain POSIX id and is printed in
  * decimal.  Otherwise the id is printed in hex, followed by the domain
  * found in idx_tree for that index and the FUID rid, as "[domain-rid]".
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	char *domain;
 
 	if (!FUID_INDEX(id)) {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 		return;
 	}
 
 	domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 	    (u_longlong_t)id, domain, (int)FUID_RID(id));
 }
 
 /*
  * Print the uid and gid lines for a znode.  If either id carries a
  * non-zero FUID index and the domain table is not loaded yet, the FUID
  * table object is looked up in the master node and loaded first, so that
  * print_idstr() can resolve the domain.
  */
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	int need_table = (FUID_INDEX(uid) || FUID_INDEX(gid));
 
 	if (need_table && !fuid_table_loaded) {
 		uint64_t fuid_obj;
 
 		/* The master node records which object holds the table. */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &fuid_obj) == 0);
 
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, fuid_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 /*
  * Object viewer for znodes / SA objects.
  *
  * On first use for a dataset the SA attribute table is set up (the
  * attribute registry object is looked up only for ZPL versions that have
  * SA).  The znode's attributes are then fetched with one bulk lookup and
  * the object's path is resolved; if that fails a "???<object#N>"
  * placeholder is used.  With dump_opt['d'] < 3 only the path is printed,
  * otherwise every fetched attribute is, plus xattr and rdev when those
  * lookups succeed.
  */
 /*ARGSUSED*/
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_bulk_attr_t bulk[12];
 	sa_handle_t *hdl;
 	uint64_t uid, gid, links, gen, mode, parent, fsize, pflags;
 	uint64_t acctm[2], modtm[2], crtm[2], chgtm[2];
 	uint64_t xattr, rdev;
 	time_t when;
 	int idx = 0;
 	int error;
 
 	if (!sa_loaded) {
 		uint64_t sa_attrs = 0;
 		uint64_t version;
 
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version) == 0);
 		if (version >= ZPL_VERSION_SA) {
 			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs) == 0);
 		}
 
 		error = sa_setup(os, sa_attrs, zfs_attr_table,
 		    ZPL_END, &sa_attr_table);
 		if (error != 0) {
 			(void) printf("sa_setup failed errno %d, can't "
 			    "display znode contents\n", error);
 			return;
 		}
 		sa_loaded = B_TRUE;
 	}
 
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl) != 0) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	/* Twelve attributes, matching the size of bulk[]. */
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
 	    NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	if (sa_bulk_lookup(hdl, bulk, idx) != 0)
 		goto out;
 
 	error = zfs_obj_to_path(os, object, path, sizeof (path));
 	if (error != 0) {
 		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
 		    (u_longlong_t)object);
 	}
 
 	if (dump_opt['d'] < 3) {
 		(void) printf("\t%s\n", path);
 		goto out;
 	}
 
 	(void) printf("\tpath	%s\n", path);
 	dump_uidgid(os, uid, gid);
 
 	/* Only the first word of each timestamp pair is printed. */
 	when = (time_t)acctm[0];
 	(void) printf("\tatime	%s", ctime(&when));
 	when = (time_t)modtm[0];
 	(void) printf("\tmtime	%s", ctime(&when));
 	when = (time_t)chgtm[0];
 	(void) printf("\tctime	%s", ctime(&when));
 	when = (time_t)crtm[0];
 	(void) printf("\tcrtime	%s", ctime(&when));
 
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 
 	/* Optional attributes: printed only when the lookup succeeds. */
 	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 
 out:
 	(void) sa_handle_destroy(hdl);
 }
 
 /*
  * Viewer installed in the "ZFS V0 ACL" and "ZFS ACL" slots of the
  * object_viewer[] table.  The body is empty, so nothing is printed for
  * such objects.
  */
 /*ARGSUSED*/
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Viewer installed in the "DMU objset" slot of the object_viewer[] table.
  * The body is empty, so nothing is printed for such objects.
  */
 /*ARGSUSED*/
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Per-object-type viewer functions.  dump_object() indexes this table
  * with ZDB_OT_TYPE() of an object's bonus type and of its data type.
  * The table has DMU_OT_NUMTYPES + 1 slots; each entry's comment names the
  * object type it serves, and the final slot (dump_unknown) must stay
  * last.  Entry order is significant: do not reorder.
  */
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bplist			*/
 	dump_none,		/* bplist header		*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group used		*/
 	dump_zap,		/* ZFS user/group quota		*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_none,		/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 /*
  * Print one row of the object table for the given object, followed by
  * more detail as verbosity rises:
  *   > 3   a second row with the bonus buffer's size and type, if any
  *   >= 4  dnode flags, maxblkid and the type-specific object_viewer output
  *   >= 5  the indirect block tree and the list of allocated segments
  *
  * Object 0 is the meta dnode; any other object is held through its bonus
  * buffer for the duration of the call (failure to hold it is fatal).
  * When *print_header is set the column header is printed first and the
  * flag cleared; it is set again after verbose (>= 4) output so that the
  * next row is preceded by a fresh header.
  */
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
 	char bonus_size[32];
 	char aux[50];
 	size_t auxlen;
 	int error;
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
 		    "%full", "type");
 		*print_header = 0;
 	}
 
 	if (object == 0) {
 		dn = DMU_META_DNODE(os);
 	} else {
 		error = dmu_bonus_hold(os, object, FTAG, &db);
 		if (error)
 			fatal("dmu_bonus_hold(%llu) failed, errno %u",
 			    (u_longlong_t)object, error);
 		bonus = db->db_data;
 		bsize = db->db_size;
 		dn = DB_DNODE((dmu_buf_impl_t *)db);
 	}
 	dmu_object_info_from_dnode(dn, &doi);
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk);
 	zdb_nicenum(doi.doi_data_block_size, dblk);
 	zdb_nicenum(doi.doi_max_offset, lsize);
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
 	zdb_nicenum(doi.doi_bonus_size, bonus_size);
 	(void) snprintf(fill, sizeof (fill), "%6.2f",
 	    100.0 * doi.doi_fill_count *
 	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
 	    doi.doi_max_offset);
 
 	aux[0] = '\0';
 
 	/*
 	 * Each append is limited to the space left in aux, not to the
 	 * size of the whole buffer.
 	 */
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		auxlen = strlen(aux);
 		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		auxlen = strlen(aux);
 		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	/* The object number is deliberately printed as a signed value. */
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
 	    (longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", bonus_size, "bonus",
 		    ZDB_OT_NAME(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (u_longlong_t)dn->dn_phys->dn_maxblkid);
 
 		/* Bonus buffer first, then the object's own contents. */
 		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
 		    bonus, bsize);
 		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
 		*print_header = 1;
 	}
 
 	if (verbosity >= 5) {
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		dump_indirect(dn);
 
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		for (;;) {
 			char segsize[32];
 
 			/* Find the next data, then the hole that ends it. */
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize);
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 }
 
 /* Printable names of the objset types, indexed by dds_type in dump_dir(). */
 static char *objset_types[DMU_OST_NUMTYPES] = {
 	"NONE",
 	"META",
 	"ZPL",
 	"ZVOL",
 	"OTHER",
 	"ANY"
 };
 
 /*
  * Print a summary line for an objset (name, type, id, creation txg,
  * referenced bytes, object count and, at verbosity >= 4, the root block
  * pointer), then its contents.
  *
  * If specific objects were requested (zopt_objects != 0) only those are
  * dumped.  Otherwise the intent log (with dump_opt['i'] or verbosity >= 2)
  * and the deadlist are printed and, at verbosity >= 2, every object in
  * the objset is dumped with dump_object().  Any dmu_object_next() result
  * other than ESRCH at the end of the walk aborts the program.
  */
 static void
 dump_dir(objset_t *os)
 {
 	dmu_objset_stats_t dds;
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
 	char numbuf[32];
 	char blkbuf[BP_SPRINTF_LEN + 20];
 	char osname[MAXNAMELEN];
 	char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	int print_header = 1;
 	int i, error;
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
 
 	/*
 	 * For the meta objset the numbers are taken from the root block
 	 * pointer's fill count and the MOS directory's used bytes; every
 	 * other objset is asked through dmu_objset_space().
 	 */
 	if (dds.dds_type == DMU_OST_META) {
 		dds.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
-		refdbytes = os->os_spa->spa_dsl_pool->
-		    dp_mos_dir->dd_phys->dd_used_bytes;
+		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+		    dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, numbuf);
 
 	if (verbosity >= 4) {
 		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
 		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
 		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
 
 	dmu_objset_name(os, osname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s\n",
 	    osname, type, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)dds.dds_creation_txg,
 	    numbuf, (u_longlong_t)usedobjs, blkbuf);
 
 	/* Explicit object list: dump just those objects and stop. */
 	if (zopt_objects != 0) {
 		for (i = 0; i < zopt_objects; i++)
 			dump_object(os, zopt_object[i], verbosity,
 			    &print_header);
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
 
 	if (verbosity < 2)
 		return;
 
 	if (BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	/*
 	 * Object 0 and the user/group accounting objects are dumped
 	 * explicitly; they are not included in object_count.
 	 */
 	dump_object(os, 0, verbosity, &print_header);
 	object_count = 0;
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
 	}
 
 	object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
 		dump_object(os, object, verbosity, &print_header);
 		object_count++;
 	}
 
 	ASSERT3U(object_count, ==, usedobjs);
 
 	(void) printf("\n");
 
 	/* ESRCH is the expected result once the walk runs out of objects. */
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 }
 
 /*
  * Print the fields of an uberblock, optionally surrounded by header and
  * footer text (either may be NULL).  The root block pointer is included
  * when dump_opt['u'] >= 3.
  *
  * header and footer are printed verbatim through "%s"; they are never
  * interpreted as printf format strings.
  */
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
 	if (dump_opt['u'] >= 3) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf("%s", footer ? footer : "");
 }
 
 /*
  * Print the pool configuration stored in the MOS config object.
  *
  * The object's bonus buffer holds the packed nvlist size as a 64-bit
  * value; that size is handed to dump_packed_nvlist() as its data
  * argument.  If the bonus buffer cannot be held, an error is reported on
  * stderr and nothing else is printed.
  */
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	/*
 	 * Same width as the on-disk value read below.  A size_t here would
 	 * be too narrow on 32-bit builds for a consumer that reads 64 bits
 	 * back through the pointer.
 	 * NOTE(review): dump_packed_nvlist() is not visible in this chunk;
 	 * confirm that it reads its data argument as a uint64_t.
 	 */
 	uint64_t nvsize = 0;
 	int error;
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 
 	if (error == 0) {
 		nvsize = *(uint64_t *)db->db_data;
 		dmu_buf_rele(db, FTAG);
 
 		(void) printf("\nMOS Configuration:\n");
 		dump_packed_nvlist(spa->spa_meta_objset,
 		    spa->spa_config_object, (void *)&nvsize, 1);
 	} else {
 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
 		    (u_longlong_t)spa->spa_config_object, error);
 	}
 }
 
 static void
 dump_cachefile(const char *cachefile)
 {
 	int fd;
 	struct stat64 statbuf;
 	char *buf;
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		exit(1);
 	}
 
 	free(buf);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 /* Size of the buffer holding the "Uberblock[N]" heading. */
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 /*
  * Print every uberblock slot of an in-memory label for which
  * uberblock_verify() returns 0, each under an "Uberblock[N]" heading.
  * A stack vdev_t with only vdev_ashift and vdev_top filled in is used to
  * drive the VDEV_UBERBLOCK_COUNT/OFFSET macros for the given ashift.
  */
 static void
 dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
 {
 	char header[ZDB_MAX_UB_HEADER_SIZE];
 	vdev_t vd;
 	vdev_t *vdp = &vd;
 	int slot;
 
 	vdp->vdev_ashift = ashift;
 	vdp->vdev_top = vdp;
 
 	for (slot = 0; slot < VDEV_UBERBLOCK_COUNT(vdp); slot++) {
 		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, slot);
 		uberblock_t *ub = (void *)((char *)lbl + uoff);
 
 		if (uberblock_verify(ub) == 0) {
 			(void) snprintf(header, sizeof (header),
 			    "Uberblock[%d]\n", slot);
 			dump_uberblock(ub, header, "");
 		}
 	}
 }
 
 static void
 dump_label(const char *dev)
 {
 	int fd;
 	vdev_label_t label;
 	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
 	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
 	struct stat64 statbuf;
 	uint64_t psize, ashift;
 	int len = strlen(dev) + 1;
 
 	if (strncmp(dev, "/dev/dsk/", 9) == 0) {
 		len++;
 		path = malloc(len);
 		(void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
 	} else {
 		path = strdup(dev);
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
 		free(path);
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		free(path);
 		(void) close(fd);
 		exit(1);
 	}
 
 	if (S_ISBLK(statbuf.st_mode)) {
 		(void) printf("cannot use '%s': character device required\n",
 		    path);
 		free(path);
 		(void) close(fd);
 		exit(1);
 	}
 
 	psize = statbuf.st_size;
 	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		nvlist_t *config = NULL;
 
 		(void) printf("--------------------------------------------\n");
 		(void) printf("LABEL %d\n", l);
 		(void) printf("--------------------------------------------\n");
 
 		if (pread64(fd, &label, sizeof (label),
 		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
 			(void) printf("failed to read label %d\n", l);
 			continue;
 		}
 
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
 			(void) printf("failed to unpack label %d\n", l);
 			ashift = SPA_MINBLOCKSHIFT;
 		} else {
 			nvlist_t *vdev_tree = NULL;
 
 			dump_nvlist(config, 4);
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 			nvlist_free(config);
 		}
 		if (dump_opt['u'])
 			dump_label_uberblocks(&label, ashift);
 	}
 
 	free(path);
 	(void) close(fd);
 }
 
 static uint64_t num_large_blocks;
 
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
 {
 	int error;
 	objset_t *os;
 
 	error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (error) {
 		(void) printf("Could not open %s, error %d\n", dsname, error);
 		return (0);
 	}
 	if (dmu_objset_ds(os)->ds_large_blocks)
 		num_large_blocks++;
 	dump_dir(os);
 	dmu_objset_disown(os, FTAG);
 	fuid_table_destroy();
 	sa_loaded = B_FALSE;
 	return (0);
 }
 
 /*
  * Block statistics.
  *
  * One zdb_blkstats_t accumulates totals for a (level, type) bucket; see
  * zdb_count_block(), which adds each block's allocated, logical and
  * physical size, bumps the block count, adds its gang count and records
  * its physical size in the histogram.
  *
  * The histogram is indexed by physical size in SPA_MINBLOCKSIZE units.
  * zdb_count_block() clamps the index to
  * SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1, the last slot, which
  * therefore collects all larger blocks.
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
 	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
 	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
 	uint64_t zb_count;		/* number of blocks counted */
 	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
 	/* blocks with 2 or 3 DVAs where at least two share a vdev */
 	uint64_t zb_ditto_samevdev;
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  * They are numbered from DMU_OT_NUMTYPES upward, so they follow the real
  * DMU object types; ZDB_OT_TOTAL is the last one and bounds the type
  * dimension of zdb_cb_t's statistics array.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 /*
  * Printable names for the extended types above, listed in the same order
  * as their offsets from DMU_OT_NUMTYPES.
  * NOTE(review): the lookup that indexes this array is outside this chunk;
  * presumably it subtracts DMU_OT_NUMTYPES -- confirm before reordering.
  */
 static char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 /* Extra "level" slot of zcb_type[] that accumulates over all levels. */
 #define	ZB_TOTAL	DN_MAX_LEVELS
 
 /*
  * State accumulated while traversing the pool's block pointers.
  *
  * zcb_type[level][type] holds the per-level, per-type statistics filled
  * in by zdb_count_block(); the ZB_TOTAL row and ZDB_OT_TOTAL column hold
  * the sums.  zcb_errors[] is indexed by the I/O error number and
  * zcb_haderrors is set once any read has failed (see zdb_blkptr_done()).
  *
  * zcb_embedded_histogram[etype][psize] is indexed directly by
  * BPE_GET_PSIZE().  That size can be as large as BPE_PAYLOAD_SIZE itself,
  * so each row needs BPE_PAYLOAD_SIZE + 1 slots; with only
  * BPE_PAYLOAD_SIZE slots a full-payload embedded block would be counted
  * one element past the end of its row.
  */
 typedef struct zdb_cb {
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE + 1];
 	uint64_t	zcb_start;
 	uint64_t	zcb_lastprint;
 	uint64_t	zcb_totalasize;
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	int		zcb_haderrors;
 	spa_t		*zcb_spa;
 } zdb_cb_t;
 
 /*
  * Account for one block pointer in the traversal statistics and, unless
  * dump_opt['L'] is set, claim it.
  *
  * The block is added to four buckets of zcb_type[]: (its level, total),
  * (its level, type), (all levels, total) and (all levels, type).  A block
  * already recorded in the zilog's bp tree is not counted again.  Embedded
  * block pointers are tallied by embedded type and payload size and are
  * never claimed.  For dedup blocks the DDT entry's reference count is
  * dropped by one; the block is claimed with txg 0 while references
  * remain, otherwise with the pool's first txg.
  */
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	for (int pass = 0; pass < 4; pass++) {
 		int level = (pass < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int otype = (pass & 1) ? type : ZDB_OT_TOTAL;
 		zdb_blkstats_t *stats = &zcb->zcb_type[level][otype];
 		int bucket;
 		int ndvas;
 
 		stats->zb_asize += BP_GET_ASIZE(bp);
 		stats->zb_lsize += BP_GET_LSIZE(bp);
 		stats->zb_psize += BP_GET_PSIZE(bp);
 		stats->zb_count++;
 
 		/*
 		 * The histogram only has room for blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; anything larger is counted in
 		 * its final slot.
 		 */
 		bucket = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		if (bucket > PSIZE_HISTO_SIZE - 1)
 			bucket = PSIZE_HISTO_SIZE - 1;
 		stats->zb_psize_histogram[bucket]++;
 
 		stats->zb_gangs += BP_COUNT_GANG(bp);
 
 		/* Note ditto copies that landed on the same vdev. */
 		ndvas = BP_GET_NDVAS(bp);
 		if (ndvas == 2) {
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				stats->zb_ditto_samevdev++;
 		} else if (ndvas == 3) {
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]) ||
 			    DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]) ||
 			    DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]))
 				stats->zb_ditto_samevdev++;
 		}
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 
 	if (dump_opt['L'])
 		return;
 
 	if (BP_GET_DEDUP(bp)) {
 		ddt_t *table = ddt_select(zcb->zcb_spa, bp);
 		ddt_entry_t *entry;
 
 		ddt_enter(table);
 		entry = ddt_lookup(table, bp, B_FALSE);
 
 		if (entry != NULL) {
 			ddt_phys_t *phys = ddt_phys_select(entry, bp);
 
 			ddt_phys_decref(phys);
 			refcnt = phys->ddp_refcnt;
 			if (ddt_phys_total_refcnt(entry) == 0)
 				ddt_remove(table, entry);
 		} else {
 			refcnt = 0;
 		}
 		ddt_exit(table);
 	}
 
 	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
 /*
  * Completion callback for the reads issued by zdb_blkptr_cb().
  *
  * Frees the data buffer, decrements spa_scrub_inflight under
  * spa_scrub_lock and broadcasts on spa_scrub_io_cv.  If the read failed
  * and was not flagged ZIO_FLAG_SPECULATIVE, the error is counted in the
  * zdb_cb_t passed as io_private and reported on stdout.
  */
 /* ARGSUSED */
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		zcb->zcb_errors[ioerr]++;
 
 		/* The block pointer itself is only printed at -bb and up. */
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		/*
 		 * zb_level is printed with %lld, so it must be passed as a
 		 * signed long long (as zdb_blkptr_cb() does).
 		 */
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Per-block callback handed to traverse_pool() by dump_block_stats().
  *
  * Optionally prints the block pointer (-bbbbb), accounts for every
  * non-hole block through zdb_count_block(), issues an asynchronous
  * verification read when -c asks for it, and periodically writes a
  * progress line to stderr.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	static int iters;
 	zdb_cb_t *zcb = arg;
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 	boolean_t want_read;
 
 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
 		char text[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(text, sizeof (text), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    text);
 	}
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	type = BP_GET_TYPE(bp);
 
 	/* Types with DMU_OT_NEWTYPE set are all counted as ZDB_OT_OTHER. */
 	if (type & DMU_OT_NEWTYPE)
 		zdb_count_block(zcb, zilog, bp, ZDB_OT_OTHER);
 	else
 		zdb_count_block(zcb, zilog, bp, type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
 	/* -c reads metadata only; -cc reads every block. */
 	if (dump_opt['c'] > 1)
 		want_read = B_TRUE;
 	else
 		want_read = (dump_opt['c'] && is_metadata) ? B_TRUE : B_FALSE;
 
 	if (!BP_IS_EMBEDDED(bp) && want_read) {
 		size_t size = BP_GET_PSIZE(bp);
 		void *data = zio_data_buf_alloc(size);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
 		/* Wait while more than max_inflight reads are outstanding. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > max_inflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	if (++iters <= 100)
 		return (0);
 	iters = 0;
 
 	/* No progress line at -bbbbb or within a second of the last one. */
 	if (dump_opt['b'] >= 5 ||
 	    gethrtime() <= zcb->zcb_lastprint + NANOSEC)
 		return (0);
 
 	{
 		uint64_t now = gethrtime();
 		char buf[10];
 		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 		uint64_t elapsed_ms = (now - zcb->zcb_start) / 1000 / 1000;
 		int kb_per_sec = 1 + bytes / (1 + elapsed_ms);
 		int sec_remaining =
 		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 		zfs_nicenum(bytes, buf, sizeof (buf));
 		(void) fprintf(stderr,
 		    "\r%5s completed (%4dMB/s) "
 		    "estimated time remaining: %uhr %02umin %02usec        ",
 		    buf, kb_per_sec / 1024,
 		    sec_remaining / 60 / 60,
 		    sec_remaining / 60 % 60,
 		    sec_remaining % 60);
 
 		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 /*
  * Metaslab ops vector installed by zdb_leak_init() on every metaslab
  * whose space map it loads; its only initialized member (alloc) is NULL.
  */
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 /*
  * Walk the dedup tables with ddt_walk() ahead of the block traversal.
  *
  * For each entry, ditto copies are counted through zdb_count_block()
  * and every other physical copy adds its extra references to the dedup
  * totals in 'zcb'.  Unless -L was given, the entry is also looked up
  * with ddt_lookup(..., B_TRUE).  The walk returns as soon as the
  * DDT_CLASS_UNIQUE class is reached.
  */
 static void
 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	ddt_bookmark_t ddb = { 0 };
 	ddt_entry_t dde;
 	int error;
 
 	for (;;) {
 		blkptr_t blk;
 
 		error = ddt_walk(spa, &ddb, &dde);
 		if (error != 0)
 			break;
 
 		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
 			return;
 
 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
 
 		for (int p = 0; p < DDT_PHYS_TYPES; p++) {
 			ddt_phys_t *ddp = &dde.dde_phys[p];
 
 			/* Slots with a zero birth txg are skipped. */
 			if (ddp->ddp_phys_birth == 0)
 				continue;
 
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 
 			if (p != DDT_PHYS_DITTO) {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
 				zcb->zcb_dedup_blocks++;
 			} else {
 				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			}
 		}
 
 		if (dump_opt['L'])
 			continue;
 
 		{
 			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
 
 			ddt_enter(ddt);
 			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
 			ddt_exit(ddt);
 		}
 	}
 
 	ASSERT(error == ENOENT);
 }
 
 /*
  * Prepare 'zcb' and the pool for leak detection.
  *
  * Unless -L was given, every metaslab of every top-level vdev is
  * unloaded and, if it has a space map, reloaded with SM_ALLOC so that
  * ms_tree describes allocated space.  The dedup tables are then walked
  * with SCL_CONFIG held as reader.
  */
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	if (!dump_opt['L']) {
 		vdev_t *root = spa->spa_root_vdev;
 
 		for (uint64_t c = 0; c < root->vdev_children; c++) {
 			vdev_t *top = root->vdev_child[c];
 
 			for (uint64_t m = 0; m < top->vdev_ms_count; m++) {
 				metaslab_t *msp = top->vdev_ms[m];
 
 				mutex_enter(&msp->ms_lock);
 				metaslab_unload(msp);
 
 				/* Nothing to load without a space map. */
 				if (msp->ms_sm == NULL) {
 					mutex_exit(&msp->ms_lock);
 					continue;
 				}
 
 				/*
 				 * For leak detection, we overload the metaslab
 				 * ms_tree to contain allocated segments
 				 * instead of free segments. As a result,
 				 * we can't use the normal metaslab_load/unload
 				 * interfaces.
 				 */
 				(void) fprintf(stderr,
 				    "\rloading space map for "
 				    "vdev %llu of %llu, "
 				    "metaslab %llu of %llu ...",
 				    (longlong_t)c,
 				    (longlong_t)root->vdev_children,
 				    (longlong_t)m,
 				    (longlong_t)top->vdev_ms_count);
 
 				msp->ms_ops = &zdb_metaslab_ops;
 
 				/*
 				 * We don't want to spend the CPU
 				 * manipulating the size-ordered
 				 * tree, so clear the range_tree
 				 * ops.
 				 */
 				msp->ms_tree->rt_ops = NULL;
 				VERIFY0(space_map_load(msp->ms_sm,
 				    msp->ms_tree, SM_ALLOC));
 				msp->ms_loaded = B_TRUE;
 
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 		(void) fprintf(stderr, "\n");
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	zdb_ddt_leak_init(spa, zcb);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Report whatever is left in the metaslab trees after the traversal.
  * Does nothing when -L was given.
  */
 static void
 zdb_leak_fini(spa_t *spa)
 {
 	vdev_t *root;
 
 	if (dump_opt['L'])
 		return;
 
 	root = spa->spa_root_vdev;
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		for (int m = 0; m < top->vdev_ms_count; m++) {
 			metaslab_t *msp = top->vdev_ms[m];
 
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * The ms_tree has been overloaded to
 			 * contain allocated segments. Now that we
 			 * finished traversing all blocks, any
 			 * block that remains in the ms_tree
 			 * represents an allocated block that we
 			 * did not claim during the traversal.
 			 * Claimed blocks would have been removed
 			 * from the ms_tree.
 			 */
 			range_tree_vacate(msp->ms_tree, zdb_leak, top);
 			msp->ms_loaded = B_FALSE;
 
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 }
 
 /*
  * Block-pointer callback used by dump_block_stats() for the
  * bpobj_iterate_nofree() and bptree_iterate() walks: count each block
  * as ZDB_OT_DEFERRED, printing it first at -bbbbb.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	if (dump_opt['b'] >= 5) {
 		char text[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(text, sizeof (text), bp);
 		(void) printf("[%s] %s\n", "deferred free", text);
 	}
 
 	zdb_count_block((zdb_cb_t *)arg, NULL, bp, ZDB_OT_DEFERRED);
 	return (0);
 }
 
 /*
  * Traverse every block of the pool, optionally verifying checksums (-c)
  * and checking for leaks (unless -L), then print block statistics.
  *
  * Returns 2 when the traversed size does not match the allocated size
  * or when no blocks were counted at all, 3 when errors were recorded
  * during the traversal, and 0 otherwise.
  */
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t zcb = { 0 };
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * Load all space maps as SM_ALLOC maps, then traverse the pool
 	 * claiming each block we discover.  If the pool is perfectly
 	 * consistent, the space maps will be empty when we're done.
 	 * Anything left over is a leak; any block we can't claim (because
 	 * it's not part of any space map) is a double allocation,
 	 * reference to a freed block, or an unclaimed log block.
 	 */
 	zdb_leak_init(spa, &zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    count_block_cb, &zcb, NULL);
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    count_block_cb, &zcb, NULL);
 	}
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    &zcb, NULL));
 	}
 
 	/* -cc also reads data blocks, so prefetch them as well. */
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	/* zcb_totalasize and zcb_start feed the progress estimate. */
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (int i = 0; i < max_ncpus; i++) {
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 			/* Install a fresh root zio in place of the waited one. */
 			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (int e = 0; e < 256; e++) {
 			if (zcb.zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb.zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	zdb_leak_fini(spa);
 
 	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	/*
 	 * Compare what the metaslab classes say is allocated with what the
 	 * traversal found, less the space accounted to dedup references.
 	 */
 	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
 	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
 
 	if (total_found == total_alloc) {
 		if (!dump_opt['L'])
 			(void) printf("\n\tNo leaks (block sum matches space"
 			    " maps exactly)\n");
 	} else {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
 		leaks = B_TRUE;
 	}
 
 	/* Nothing was counted: return before the averages divide by it. */
 	if (tzb->zb_count == 0)
 		return (2);
 
 	(void) printf("\n");
 	(void) printf("\tbp count:      %10llu\n",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\tganged count:  %10llu\n",
 	    (longlong_t)tzb->zb_gangs);
 	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\tbp physical:   %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\tbp allocated:  %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\tbp deduped:    %10llu    ref>1:"
 	    " %6llu   deduplication: %6.2f\n",
 	    (u_longlong_t)zcb.zcb_dedup_asize,
 	    (u_longlong_t)zcb.zcb_dedup_blocks,
 	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	/* Embedded block pointers are reported per embedded type. */
 	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb.zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb.zcb_embedded_histogram[i],
 			    sizeof (zcb.zcb_embedded_histogram[i]) /
 			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (longlong_t)tzb->zb_ditto_samevdev);
 	}
 
 	/* At -bb and above, print one row (or more) per object type. */
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			char csize[32], lsize[32], psize[32], asize[32];
 			char avg[32], gang[32];
 			char *typename;
 
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			/* Types with nothing allocated get a row of dashes. */
 			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			/*
 			 * Walk the levels from the highest down to 0; the
 			 * final iteration (l == -1) selects the ZB_TOTAL row.
 			 */
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb.zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				/* Per-level rows are only shown at -bbb and up. */
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				/* Skip an L0 row that would equal the total. */
 				if (level == 0 && zb->zb_asize ==
 				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize);
 				zdb_nicenum(zb->zb_lsize, lsize);
 				zdb_nicenum(zb->zb_psize, psize);
 				zdb_nicenum(zb->zb_asize, asize);
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
 				zdb_nicenum(zb->zb_gangs, gang);
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 	}
 
 	(void) printf("\n");
 
 	/* A size mismatch takes precedence over traversal errors. */
 	if (leaks)
 		return (2);
 
 	if (zcb.zcb_haderrors)
 		return (3);
 
 	return (0);
 }
 
 /*
  * One node of the AVL tree built by zdb_ddt_add_cb() for the simulated
  * dedup table (-S).  The zdde_ref_* fields are running sums over every
  * block pointer that produced the same key.
  */
 typedef struct zdb_ddt_entry {
 	ddt_key_t	zdde_key;	/* filled in by ddt_key_fill() */
 	uint64_t	zdde_ref_blocks; /* number of block pointers seen */
 	uint64_t	zdde_ref_lsize;	/* sum of BP_GET_LSIZE() */
 	uint64_t	zdde_ref_psize;	/* sum of BP_GET_PSIZE() */
 	uint64_t	zdde_ref_dsize;	/* sum of bp_get_dsize_sync() */
 	avl_node_t	zdde_node;	/* AVL tree linkage */
 } zdb_ddt_entry_t;
 
 /*
  * traverse_pool() callback used by dump_simulated_ddt().
  *
  * 'arg' is the AVL tree of zdb_ddt_entry_t nodes.  Level-0,
  * non-metadata blocks with a checksum are looked up by their dedup key;
  * a node is created on first sight and its reference totals are
  * increased on every sight.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	avl_tree_t *tree = arg;
 	avl_index_t where;
 	zdb_ddt_entry_t probe;
 	zdb_ddt_entry_t *entry;
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/* At -SS, report progress once per objset root. */
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(tree));
 	}
 
 	/* Holes were already rejected above; filter out the rest. */
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
 		return (0);
 	if (BP_GET_LEVEL(bp) > 0)
 		return (0);
 	if (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	ddt_key_fill(&probe.zdde_key, bp);
 
 	entry = avl_find(tree, &probe, &where);
 	if (entry == NULL) {
 		entry = umem_zalloc(sizeof (*entry), UMEM_NOFAIL);
 		entry->zdde_key = probe.zdde_key;
 		avl_insert(tree, entry, where);
 	}
 
 	entry->zdde_ref_blocks += 1;
 	entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	entry->zdde_ref_psize += BP_GET_PSIZE(bp);
 	entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 /*
  * Implement -S: build a simulated dedup table by traversing the pool,
  * turn it into a DDT histogram and print the histogram and the
  * resulting dedup ratio.
  */
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t tree;
 	void *cookie = NULL;
 	zdb_ddt_entry_t *entry;
 	ddt_histogram_t histogram = { 0 };
 	ddt_stat_t totals = { 0 };
 
 	avl_create(&tree, ddt_entry_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 	    zdb_ddt_add_cb, &tree);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Drain the tree, folding each node into the histogram. */
 	for (;;) {
 		ddt_stat_t dds;
 		uint64_t refcnt;
 
 		entry = avl_destroy_nodes(&tree, &cookie);
 		if (entry == NULL)
 			break;
 
 		refcnt = entry->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
 		/* Per-block figures are the referenced sums / refcnt. */
 		dds.dds_blocks = entry->zdde_ref_blocks / refcnt;
 		dds.dds_lsize = entry->zdde_ref_lsize / refcnt;
 		dds.dds_psize = entry->zdde_ref_psize / refcnt;
 		dds.dds_dsize = entry->zdde_ref_dsize / refcnt;
 
 		dds.dds_ref_blocks = entry->zdde_ref_blocks;
 		dds.dds_ref_lsize = entry->zdde_ref_lsize;
 		dds.dds_ref_psize = entry->zdde_ref_psize;
 		dds.dds_ref_dsize = entry->zdde_ref_dsize;
 
 		/* The histogram bucket is chosen by highbit64(refcnt) - 1. */
 		ddt_stat_add(&histogram.ddh_stat[highbit64(refcnt) - 1],
 		    &dds, 0);
 
 		umem_free(entry, sizeof (*entry));
 	}
 
 	avl_destroy(&tree);
 
 	ddt_histogram_stat(&totals, &histogram);
 
 	(void) printf("Simulated DDT histogram:\n");
 
 	zpool_dump_ddt(&totals, &histogram);
 
 	dump_dedup_ratio(&totals);
 }
 
 /*
  * Pool-level driver: run whichever dumps the command-line options in
  * dump_opt[] ask for, in a fixed order.  -S is exclusive and returns
  * right after the simulated DDT.  If any check produced a non-zero
  * status, the process exits with that status.
  */
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int status = 0;
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (dump_opt['C'] > 1 && !dump_opt['e']) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		uint64_t on_disk;
 
 		dump_dir(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
 			dump_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    spa->spa_dsl_pool->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		/*
 		 * Compare num_large_blocks with the large_blocks feature
 		 * refcount read from the pool.
 		 */
 		(void) feature_get_refcount(spa,
 		    &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &on_disk);
 		if (num_large_blocks == on_disk) {
 			(void) printf("Verified large_blocks feature refcount "
 			    "is correct (%llu)\n", (longlong_t)on_disk);
 		} else {
 			(void) printf("large_blocks feature refcount mismatch: "
 			    "expected %lld != actual %lld\n",
 			    (longlong_t)num_large_blocks,
 			    (longlong_t)on_disk);
 			status = 2;
 		}
 	}
 
 	/* Later checks only run while no earlier one has failed. */
 	if (status == 0 && (dump_opt['b'] || dump_opt['c']))
 		status = dump_block_stats(spa);
 
 	if (status == 0)
 		status = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (status != 0)
 		exit(status);
 }
 
 /*
  * Option bits for the block-read path (zdb_read_block() and the dump
  * helpers it calls).  Each bit is selected by one character of the
  * flag string in a block specifier.
  */
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_PHYS		0x0020
 #define	ZDB_FLAG_RAW		0x0040
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0080
 
 /*
  * Indexed by flag character; holds the ZDB_FLAG_* bit for that
  * character.  zdb_read_block() treats a zero entry as an invalid flag.
  */
 int flagbits[256];
 
 /*
  * Print one block pointer on its own line.  With ZDB_FLAG_BSWAP the
  * block pointer is byte-swapped in place before it is formatted.
  */
 static void
 zdb_print_blkptr(blkptr_t *bp, int flags)
 {
 	char text[BP_SPRINTF_LEN];
 
 	if ((flags & ZDB_FLAG_BSWAP) != 0)
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("%s\n", text);
 }
 
 /*
  * Print 'nbps' consecutive block pointers starting at 'bp', one per
  * line, in array order.
  */
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	int idx = 0;
 
 	while (idx < nbps) {
 		zdb_print_blkptr(&bp[idx], flags);
 		idx++;
 	}
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 /*
  * Write 'size' bytes of 'buf' to standard output unmodified, after
  * byte-swapping the buffer in place when ZDB_FLAG_BSWAP is set.
  *
  * write(2) may transfer fewer bytes than requested (for instance when
  * stdout is a pipe), so keep writing until the whole buffer has been
  * emitted.  A failing write() still ends the dump silently, as before:
  * the raw dump is best-effort.
  */
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	const char *cursor = buf;
 	uint64_t left = size;
 
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
 
 	while (left > 0) {
 		ssize_t done = write(1, cursor, left);
 
 		/* Error or no progress: give up rather than spin. */
 		if (done <= 0)
 			break;
 
 		cursor += done;
 		left -= done;
 	}
 }
 
 /*
  * Print 'size' bytes of 'buf' as a hex dump preceded by 'label'.
  *
  * Each output line shows the byte offset, two 64-bit words in hex
  * (byte-swapped for display when ZDB_FLAG_BSWAP is set) and the same 16
  * bytes as characters, with non-printable bytes shown as '.'.  Words
  * are consumed two at a time, so 'size' is expected to be a multiple
  * of 16 bytes.
  */
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	int nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	int i;
 	size_t j;
 	char *hdr;
 	unsigned char *c;
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		/*
 		 * Read the bytes as unsigned char: passing a negative
 		 * plain char to isprint() is undefined behavior.
 		 */
 		c = (unsigned char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  *
  * A leaf name matches a vdev when it equals the vdev's full path, the
  * last component of that path, or the last component with a trailing
  * "s0" removed.  Returns the vdev found, or NULL if 'vdev' is NULL or
  * nothing matches.
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, char *path)
 {
 	char *s, *p, *q;
 	size_t len;
 	int i;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/* First, assume the x.x.x.x format */
 	i = (int)strtoul(path, &s, 10);
 	if (s == path || (s && *s != '.' && *s != '\0'))
 		goto name;
 	if (i < 0 || i >= vdev->vdev_children)
 		return (NULL);
 
 	vdev = vdev->vdev_child[i];
 	if (*s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 
 		/* A child without a path is searched recursively. */
 		if (vc->vdev_path == NULL) {
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 		if (strcmp(p, path) == 0)
 			return (vc);
 
 		/*
 		 * The "s0" suffix test looks at the last two characters of
 		 * the path; a path shorter than that cannot carry the suffix
 		 * and must not be indexed before its first character.
 		 */
 		len = strlen(vc->vdev_path);
 		if (len < 2)
 			continue;
 		q = &vc->vdev_path[len - 2];
 
 		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
  *
  *	pool:vdev_specifier:offset:size[:flags]
  *
  *	pool           - The name of the pool you wish to read from
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	size           - Amount of data to read, in hex, in bytes
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		*c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 p: Do I/O to physical offset
  *		 r: Dump raw data to stdout
  *
  *              * = not yet implemented
  *
  * NOTE(review): the string parsed here ('thing') starts at the vdev
  * specifier; the pool is supplied separately as 'spa'.
  *
  * 'size' must be non-zero, a multiple of DEV_BSIZE and no larger than
  * SPA_MAXBLOCKSIZE (the size of the I/O buffers); 'offset' must be a
  * multiple of DEV_BSIZE.  Problems are reported on stdout and the
  * function returns without reading.
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	void *pbuf, *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
 	int i, error;
 
 	/* strtok() modifies its input, so work on a private copy. */
 	dup = strdup(thing);
 	s = strtok(dup, ":");
 	vdev = s ? s : "";
 	s = strtok(NULL, ":");
 	offset = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	size = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	flagstr = s ? s : "";
 
 	s = NULL;
 	if (size == 0)
 		s = "size must not be zero";
 	/* pbuf/lbuf below hold at most SPA_MAXBLOCKSIZE bytes. */
 	if (size > SPA_MAXBLOCKSIZE)
 		s = "size must not exceed the maximum block size";
 	if (!IS_P2ALIGNED(size, DEV_BSIZE))
 		s = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		s = "offset must be a multiple of sector size";
 	if (s) {
 		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
 		free(dup);
 		return;
 	}
 
 	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
 		for (i = 0; flagstr[i]; i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Invalid flag: %c\n",
 				    flagstr[i]);
 				continue;
 			}
 			flags |= bit;
 
 			/* If it's not something with an argument, keep going */
 			if ((bit & (ZDB_FLAG_CHECKSUM |
 			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
 				continue;
 
 			p = &flagstr[i + 1];
 			if (bit == ZDB_FLAG_PRINT_BLKPTR)
 				blkptr_offset = strtoull(p, &p, 16);
 			if (*p != ':' && *p != '\0') {
 				(void) printf("***Invalid flag arg: '%s'\n", s);
 				free(dup);
 				return;
 			}
 			i += p - &flagstr[i + 1]; /* skip over the number */
 		}
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		free(dup);
 		return;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	psize = size;
 	lsize = size;
 
 	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	/* Build a block pointer describing the requested region. */
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
 		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		/*
 		 * We don't know how the data was compressed, so just try
 		 * every decompress function at every inflated blocksize.
 		 */
 		enum zio_compress c;
 		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 		bcopy(pbuf, pbuf2, psize);
 
 		/*
 		 * Fill the tails of the two input copies with different
 		 * pseudo-random bytes: a decompression is accepted only if
 		 * both copies yield the same output.
 		 */
 		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
 		    SPA_MAXBLOCKSIZE - psize) == 0);
 
 		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
 		    SPA_MAXBLOCKSIZE - psize) == 0);
 
 		/*
 		 * Step through every candidate logical size exactly once;
 		 * the for-statement alone advances lsize.
 		 */
 		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
 		    lsize -= SPA_MINBLOCKSIZE) {
 			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
 				if (zio_decompress_data(c, pbuf, lbuf,
 				    psize, lsize) == 0 &&
 				    zio_decompress_data(c, pbuf2, lbuf2,
 				    psize, lsize) == 0 &&
 				    bcmp(lbuf, lbuf2, lsize) == 0)
 					break;
 			}
 			if (c != ZIO_COMPRESS_FUNCTIONS)
 				break;
 		}
 
 		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
 		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
 
 		if (lsize <= psize) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 		buf = lbuf;
 		size = lsize;
 	} else {
 		buf = pbuf;
 		size = psize;
 	}
 
 	/* Exactly one output format is used, in this order of precedence. */
 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	else if (flags & ZDB_FLAG_RAW)
 		zdb_dump_block_raw(buf, size, flags);
 	else if (flags & ZDB_FLAG_INDIRECT)
 		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
 		    flags);
 	else if (flags & ZDB_FLAG_GBH)
 		zdb_dump_gbh(buf, flags);
 	else
 		zdb_dump_block(thing, buf, size, flags);
 
 out:
 	umem_free(pbuf, SPA_MAXBLOCKSIZE);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }
 
 /*
  * Decide whether the pool configuration 'cfg' is the pool named by
  * 'tgt'.  If 'tgt' parses (strtoull, base 0) to a non-zero number it is
  * compared with the config's pool GUID, otherwise it is compared with
  * the config's pool name.  A failed config lookup yields B_FALSE.
  */
 static boolean_t
 pool_match(nvlist_t *cfg, char *tgt)
 {
 	uint64_t want_guid = strtoull(tgt, NULL, 0);
 
 	if (want_guid == 0) {
 		char *pool_name;
 
 		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
 		    &pool_name) != 0)
 			return (B_FALSE);
 		return (strcmp(pool_name, tgt) == 0);
 	} else {
 		uint64_t pool_guid;
 
 		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID,
 		    &pool_guid) != 0)
 			return (B_FALSE);
 		return (pool_guid == want_guid);
 	}
 }
 
 /*
  * Search the importable pools for the one named by '*target'.
  *
  * Only the part of '*target' before the first '/' or '@' is used for
  * matching.  'dirc'/'dirv' are passed to zpool_search_import() as
  * search paths.  If more than one pool matches, all matching configs
  * are printed and fatal() is called.  On a single match the pool name
  * is returned and '*configp' is set to its config; if '*target' did
  * not start with that name (e.g. a GUID was given) it is replaced by a
  * newly allocated "name[/rest]" string.  With no match NULL is
  * returned and '*configp' is set to NULL.
  */
 static char *
 find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
 {
 	nvlist_t *pools;
 	nvlist_t *first_cfg = NULL;
 	char *first_name = NULL;
 	char *sepp;
 	char sep = '\0';
 	int nmatches = 0;
 	importargs_t args = { 0 };
 
 	args.paths = dirc;
 	args.path = dirv;
 	args.can_be_active = B_TRUE;
 
 	/* Cut the target at the separator for the search. */
 	sepp = strpbrk(*target, "/@");
 	if (sepp != NULL) {
 		sep = *sepp;
 		*sepp = '\0';
 	}
 
 	pools = zpool_search_import(g_zfs, &args);
 
 	if (pools != NULL) {
 		nvpair_t *elem;
 
 		for (elem = nvlist_next_nvpair(pools, NULL); elem != NULL;
 		    elem = nvlist_next_nvpair(pools, elem)) {
 			verify(nvpair_value_nvlist(elem, configp) == 0);
 
 			if (!pool_match(*configp, *target))
 				continue;
 
 			nmatches++;
 
 			if (first_cfg == NULL) {
 				first_cfg = *configp;
 				first_name = nvpair_name(elem);
 				continue;
 			}
 
 			/* print previously found config */
 			if (first_name != NULL) {
 				(void) printf("%s\n", first_name);
 				dump_nvlist(first_cfg, 8);
 				first_name = NULL;
 			}
 			(void) printf("%s\n", nvpair_name(elem));
 			dump_nvlist(*configp, 8);
 		}
 	}
 
 	if (nmatches > 1)
 		(void) fatal("\tMatched %d pools - use pool GUID "
 		    "instead of pool name or \n"
 		    "\tpool name part of a dataset name to select pool",
 		    nmatches);
 
 	/* Put the separator back now that the search is over. */
 	if (sepp != NULL)
 		*sepp = sep;
 
 	/*
 	 * If pool GUID was specified for pool id, replace it with pool name
 	 */
 	if (first_name != NULL &&
 	    strstr(*target, first_name) != *target) {
 		int sz = 1 + strlen(first_name) +
 		    ((sepp != NULL) ? strlen(sepp) : 0);
 
 		*target = umem_alloc(sz, UMEM_NOFAIL);
 		(void) snprintf(*target, sz, "%s%s", first_name,
 		    (sepp != NULL) ? sepp : "");
 	}
 
 	*configp = (first_name != NULL) ? first_cfg : NULL;
 
 	return (first_name);
 }
 
 /*
  * zdb entry point.
  *
  * Parses the command line into dump_opt[] and related globals, initializes
  * the userland ZFS context (kernel_init(FREAD) and libzfs_init()), opens the
  * target -- importing it first from on-disk labels when -e is given -- and
  * then either dumps the requested objects / pool state or, with -R, reads
  * the blocks named by the remaining arguments.
  *
  * Returns 0 when it reaches the end; errors are reported through usage()
  * and fatal().
  */
 int
 main(int argc, char **argv)
 {
 	int i, c;
 	struct rlimit rl = { 1024, 1024 };
 	spa_t *spa = NULL;
 	objset_t *os = NULL;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target;
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int rewind = ZPOOL_NEVER_REWIND;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
 
 	dprintf_setup(&argc, argv);
 
 	while ((c = getopt(argc, argv,
 	    "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
 		switch (c) {
 		/* Options that select what to dump; any of them disables dump_all. */
 		case 'b':
 		case 'c':
 		case 'd':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 's':
 		case 'u':
 		case 'C':
 		case 'D':
 		case 'M':
 		case 'R':
 		case 'S':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		/* Modifier options; these leave dump_all untouched. */
 		case 'A':
 		case 'F':
 		case 'L':
 		case 'X':
 		case 'e':
 		case 'P':
 			dump_opt[c]++;
 			break;
 		case 'I':
 			max_inflight = strtoull(optarg, NULL, 0);
 			if (max_inflight == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight I/Os must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'p':
 			/* Grow the search directory array by one slot. */
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				bcopy(searchdirs, tmp, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			spa_config_path = optarg;
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	kernel_init(FREAD);
 	g_zfs = libzfs_init();
 	ASSERT(g_zfs != NULL);
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	/*
 	 * With no explicit selection, turn on every option except the
 	 * ones listed in the string below; then add the verbosity level to
 	 * each enabled option.
 	 */
 	for (c = 0; c < 256; c++) {
 		if (dump_all && !strchr("elAFLRSXP", c))
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
 	zfs_recover = (dump_opt['A'] > 1);
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 2 && dump_opt['R'])
 		usage();
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			return (0);
 		}
 		usage();
 	}
 
 	if (dump_opt['l']) {
 		dump_label(argv[0]);
 		return (0);
 	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 	target = argv[0];
 
 	if (dump_opt['e']) {
 		nvlist_t *cfg = NULL;
 		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
 
 		/* Stays ENOENT unless find_zpool() produced a pool name. */
 		error = ENOENT;
 		if (name) {
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_REWIND_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 			/* Retry as a verbatim import if the first try fails. */
 			if ((error = spa_import(name, cfg, NULL,
 			    ZFS_IMPORT_MISSING_LOG)) != 0) {
 				error = spa_import(name, cfg, NULL,
 				    ZFS_IMPORT_VERBATIM);
 			}
 		}
 	}
 
 	if (error == 0) {
 		/*
 		 * A target without '/' or '@' (or any target under -R) is
 		 * opened as a pool; otherwise it is owned as an objset.
 		 */
 		if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else {
 			error = dmu_objset_own(target, DMU_OST_ANY,
 			    B_TRUE, FTAG, &os);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	argv++;
 	argc--;
 	if (!dump_opt['R']) {
 		if (argc > 0) {
 			/* Remaining arguments are object numbers to dump. */
 			zopt_objects = argc;
 			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
 			/* Fail cleanly rather than dereference NULL below. */
 			if (zopt_object == NULL)
 				fatal("internal error: %s", strerror(ENOMEM));
 			for (i = 0; i < zopt_objects; i++) {
 				errno = 0;
 				zopt_object[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_object[i] == 0 && errno != 0)
 					fatal("bad number %s: %s",
 					    argv[i], strerror(errno));
 			}
 		}
 		if (os != NULL) {
 			dump_dir(os);
 		} else if (zopt_objects > 0 && !dump_opt['m']) {
 			dump_dir(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		/* -R: map flag letters to block-read flags, then read. */
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['p'] = ZDB_FLAG_PHYS;
 		flagbits['r'] = ZDB_FLAG_RAW;
 
 		for (i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
 
 	fuid_table_destroy();
 	sa_loaded = B_FALSE;
 
 	libzfs_fini(g_zfs);
 	kernel_fini();
 
 	return (0);
 }
Index: head/cddl/contrib/opensolaris
===================================================================
--- head/cddl/contrib/opensolaris	(revision 275781)
+++ head/cddl/contrib/opensolaris	(revision 275782)

Property changes on: head/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor/illumos/dist:r275551
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 275782)
@@ -1,2858 +1,2836 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 
 /*
  * Number of times that zfs_free_range() took the slow path while doing
  * a zfs receive.  A nonzero value indicates a potential performance problem.
  */
 uint64_t zfs_free_range_recv_miss;
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_cache;
 
 /*
  * Object constructor handed to kmem_cache_create() in dbuf_init(): zero the
  * new dmu_buf_impl_t and set up its mutex, condition variable and hold
  * refcount.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	bzero(dbuf, sizeof (*dbuf));
 
 	mutex_init(&dbuf->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf->db_changed, NULL, CV_DEFAULT, NULL);
 	refcount_create(&dbuf->db_holds);
 
 	return (0);
 }
 
 /*
  * Object destructor handed to kmem_cache_create() in dbuf_init(): tear down
  * the primitives that dbuf_cons() created.
  */
 /* ARGSUSED */
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	mutex_destroy(&dbuf->db_mtx);
 	cv_destroy(&dbuf->db_changed);
 	refcount_destroy(&dbuf->db_holds);
 }
 
 /*
  * dbuf hash table routines
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 static uint64_t dbuf_hash_count;
 
 /*
  * Compute the 64-bit hash of a dbuf identity (objset pointer, object
  * number, level, block id) using the zfs_crc64_table lookup table.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	uintptr_t osaddr = (uintptr_t)os;
 	uint64_t h = -1ULL;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 	/* Fold one input byte into the running value per step. */
 	h = zfs_crc64_table[(h ^ (lvl)) & 0xFF] ^ (h >> 8);
 	h = zfs_crc64_table[(h ^ (osaddr >> 6)) & 0xFF] ^ (h >> 8);
 	h = zfs_crc64_table[(h ^ (obj >> 0)) & 0xFF] ^ (h >> 8);
 	h = zfs_crc64_table[(h ^ (obj >> 8)) & 0xFF] ^ (h >> 8);
 	h = zfs_crc64_table[(h ^ (blkid >> 0)) & 0xFF] ^ (h >> 8);
 	h = zfs_crc64_table[(h ^ (blkid >> 8)) & 0xFF] ^ (h >> 8);
 
 	/* Mix in the higher-order bits that the byte steps did not use. */
 	h ^= (osaddr >> 14) ^ (obj >> 16) ^ (blkid >> 16);
 
 	return (h);
 }
 
 /*
  * Hash of a dbuf identity; thin wrapper around dbuf_hash().  The expansion
  * carries no trailing semicolon so the macro can be used inside a larger
  * expression and does not leave an empty statement after each use.
  */
 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
 
 /*
  * True when dbuf has exactly the given identity: same object number,
  * objset, level and block id.
  */
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 /*
  * Look up the dbuf for (dn's objset, dn's object, level, blkid) in the
  * global hash table.  Entries in the DB_EVICTING state are skipped.
  *
  * On a hit the dbuf is returned with its db_mtx held and the hash bucket
  * mutex released; on a miss NULL is returned with no locks held.
  */
 dmu_buf_impl_t *
 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	objset_t *objset = dn->dn_objset;
 	uint64_t object = dn->dn_object;
 	uint64_t hashval = DBUF_HASH(objset, object, level, blkid);
 	uint64_t bucket = hashval & table->hash_table_mask;
 	dmu_buf_impl_t *cand;
 
 	mutex_enter(DBUF_HASH_MUTEX(table, bucket));
 	cand = table->hash_table[bucket];
 	while (cand != NULL) {
 		if (DBUF_EQUAL(cand, objset, object, level, blkid)) {
 			mutex_enter(&cand->db_mtx);
 			if (cand->db_state != DB_EVICTING) {
 				/* Hand the dbuf back still locked. */
 				mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 				return (cand);
 			}
 			mutex_exit(&cand->db_mtx);
 		}
 		cand = cand->db_hash_next;
 	}
 	mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 
 	return (NULL);
 }
 
 /*
  * Add db to the hash table unless an equal, non-evicting dbuf is already
  * there.
  *
  * If such a dbuf exists it is returned with its db_mtx held and db is not
  * inserted.  Otherwise db is linked at the head of its bucket chain, NULL
  * is returned, and db's own db_mtx is left held.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	objset_t *objset = db->db_objset;
 	uint64_t object = db->db.db_object;
 	int lvl = db->db_level;
 	uint64_t blk = db->db_blkid;
 	uint64_t hashval = DBUF_HASH(objset, object, lvl, blk);
 	uint64_t bucket = hashval & table->hash_table_mask;
 	dmu_buf_impl_t *cur;
 
 	mutex_enter(DBUF_HASH_MUTEX(table, bucket));
 	cur = table->hash_table[bucket];
 	while (cur != NULL) {
 		if (DBUF_EQUAL(cur, objset, object, lvl, blk)) {
 			mutex_enter(&cur->db_mtx);
 			if (cur->db_state != DB_EVICTING) {
 				/* Existing entry wins; return it locked. */
 				mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 				return (cur);
 			}
 			mutex_exit(&cur->db_mtx);
 		}
 		cur = cur->db_hash_next;
 	}
 
 	/* No live duplicate: push db onto the front of the chain. */
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = table->hash_table[bucket];
 	table->hash_table[bucket] = db;
 	mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state,
  * have no holds, and the caller must not hold its db_mtx.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		/*
 		 * Running off the end of the chain means db was never in
 		 * the table; check before using dbf to form the next link.
 		 */
 		ASSERT(dbf != NULL);
 		dbp = &dbf->db_hash_next;
 	}
 	/* Unlink db by pointing its predecessor's link past it. */
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_dec_64(&dbuf_hash_count);
 }
 
 static arc_evict_func_t dbuf_do_evict;
 
 /*
  * Invoke the user eviction callback registered on a level-0 dbuf, if any,
  * then clear the user pointer and the callback so it cannot run again.
  * Does nothing for indirect (level != 0) dbufs or when no callback is set.
  * The caller must hold db_mtx.
  */
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_level != 0 || db->db_evict_func == NULL)
 		return;
 
-	if (db->db_user_data_ptr_ptr)
-		*db->db_user_data_ptr_ptr = db->db.db_data;
 	db->db_evict_func(&db->db, db->db_user_ptr);
 	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
 	db->db_evict_func = NULL;
 }
 
 /*
  * Report whether a dbuf holds metadata.  Every indirect block (level > 0)
  * counts as metadata; a level-0 block is classified by applying
  * DMU_OT_IS_METADATA() to its dnode's type.
  */
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	boolean_t meta;
 
 	if (db->db_level > 0)
 		return (B_TRUE);
 
 	DB_DNODE_ENTER(db);
 	meta = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 	DB_DNODE_EXIT(db);
 
 	return (meta);
 }
 
 /*
  * Evict a dbuf that has no ARC buffer attached and no pending dirty data:
  * dbuf_clear() followed by dbuf_destroy().  The caller must hold db_mtx
  * on entry.
  */
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
 	dbuf_clear(db);
 	dbuf_destroy(db);
 }
 
 /*
  * Set up the global dbuf state: allocate the hash table, create the
  * dmu_buf_impl_t kmem cache, and initialize the hash bucket mutexes.
  */
 void
 dbuf_init(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hsize;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 4K block size.  The table will take up
 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
 	for (hsize = 1ULL << 16;
 	    hsize * 4096 < (uint64_t)physmem * PAGESIZE; hsize <<= 1)
 		continue;
 
 	/* Halve the table size and try again until an allocation succeeds. */
 	for (;;) {
 		h->hash_table_mask = hsize - 1;
 		h->hash_table = kmem_zalloc(hsize * sizeof (void *),
 		    KM_NOSLEEP);
 		if (h->hash_table != NULL)
 			break;
 
 		/* XXX - we should really return an error instead of assert */
 		ASSERT(hsize > (1ULL << 10));
 		hsize >>= 1;
 	}
 
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	i = 0;
 	while (i < DBUF_MUTEXES) {
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 		i++;
 	}
 }
 
 /*
  * Undo dbuf_init(): destroy the hash bucket mutexes, free the hash table,
  * and destroy the dbuf kmem cache.
  */
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	int m = 0;
 
 	while (m < DBUF_MUTEXES) {
 		mutex_destroy(&table->hash_mutexes[m]);
 		m++;
 	}
 
 	/* The table holds (mask + 1) pointer-sized buckets. */
 	kmem_free(table->hash_table,
 	    (table->hash_table_mask + 1) * sizeof (void *));
 	kmem_cache_destroy(dbuf_cache);
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency check of a dbuf against its dnode, parent, dirty
  * records and block pointer.  Compiled only under ZFS_DEBUG, and returns
  * immediately unless ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.  The caller
  * must hold db_mtx.  All checks are ASSERTs; nothing is modified.
  */
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;
 
 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	/* Size and offset expectations differ for bonus, spill and data. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
 	/* Every dirty record on either list must point back at this dbuf. */
 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}
 
 	/* verify db->db_blkptr */
 	if (db->db_blkptr) {
 		/*
 		 * NOTE(review): dn is dereferenced here and further below
 		 * although the dn == NULL case is handled above; confirm dn
 		 * cannot be NULL whenever db_blkptr is set.
 		 */
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 */
 		if (db->db_dirtycnt == 0) {
 			uint64_t *buf = db->db.db_data;
 			int i;
 
 			/* Scan the buffer as 64-bit words (db_size / 8). */
 			for (i = 0; i < db->db.db_size >> 3; i++) {
 				ASSERT(buf[i] == 0);
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 static void
-dbuf_update_data(dmu_buf_impl_t *db)
-{
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
-		ASSERT(!refcount_is_zero(&db->db_holds));
-		*db->db_user_data_ptr_ptr = db->db.db_data;
-	}
-}
-
-static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	/*
 	 * Attach buf as the dbuf's ARC buffer, or detach the current one
 	 * when buf is NULL.  The caller must hold db_mtx.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	db->db_buf = buf;
 	if (buf != NULL) {
 		ASSERT(buf->b_data != NULL);
 		db->db.db_data = buf->b_data;
 		/* Register dbuf_do_evict on buffers not yet released. */
 		if (!arc_released(buf))
 			arc_set_callback(buf, dbuf_do_evict, db);
-		dbuf_update_data(db);
 	} else {
 		/*
 		 * Detaching: run the user eviction callback, drop the data
 		 * pointer, and mark the dbuf uncached unless it is NOFILL.
 		 */
 		dbuf_evict_user(db);
 		db->db.db_data = NULL;
 		if (db->db_state != DB_NOFILL)
 			db->db_state = DB_UNCACHED;
 	}
 }
 
 /*
  * Loan out an arc_buf holding this dbuf's data and return it.
  *
  * When the dbuf's buffer has not been released and there is at most one
  * hold, the dbuf's own buffer is loaned and detached from the dbuf.
  * Otherwise a fresh loaned buffer is allocated and the data copied into it.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *loan;
 	spa_t *pool;
 	int nbytes;
 
 	mutex_enter(&db->db_mtx);
 	if (!arc_released(db->db_buf) && refcount_count(&db->db_holds) <= 1) {
 		/* Sole user: give away the dbuf's own buffer. */
 		loan = db->db_buf;
 		arc_loan_inuse_buf(loan, db);
 		dbuf_set_data(db, NULL);
 		mutex_exit(&db->db_mtx);
 		return (loan);
 	}
 
 	/* Shared or released: hand out a private copy instead. */
 	nbytes = db->db.db_size;
 	pool = db->db_objset->os_spa;
 	mutex_exit(&db->db_mtx);
 
 	loan = arc_loan_buf(pool, nbytes);
 	bcopy(db->db.db_data, loan->b_data, nbytes);
 
 	return (loan);
 }
 
 /*
  * Map a byte offset within dn to a level-0 block id.  When the dnode has a
  * nonzero dn_datablkshift the id is offset shifted right by it; otherwise
  * the answer is block 0 and the offset is asserted to be below
  * dn_datablksz.
  */
 uint64_t
 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 {
 	if (dn->dn_datablkshift == 0) {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}
 
 	return (offset >> dn->dn_datablkshift);
 }
 
 /*
  * Completion callback passed to arc_read() by dbuf_read_impl().
  *
  * zio - the read zio; may be NULL, which is treated like success
  * buf - the ARC buffer produced by the read
  * vdb - the dmu_buf_impl_t being filled
  *
  * Moves the dbuf out of DB_READ to DB_CACHED (attaching buf) or, on error,
  * back to DB_UNCACHED (dropping buf), wakes any waiters on db_changed, and
  * releases a hold via dbuf_rele_and_unlock().
  */
 static void
 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);
 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else if (zio == NULL || zio->io_error == 0) {
 		/* Successful read: attach the buffer as-is. */
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
 		/* Failed read: drop the buffer and go back to uncached. */
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
 	dbuf_rele_and_unlock(db, NULL);
 }
 
 /*
  * Start filling an uncached dbuf.
  *
  * db    - the dbuf; must be held, in DB_UNCACHED with no buffer attached
  * zio   - parent zio handed to arc_read()
  * flags - DB_RF_* flags; DB_RF_CACHED is OR-ed in when the data was
  *         available without I/O
  *
  * Entered with db_mtx held and the dnode's struct_rwlock held; db_mtx is
  * dropped on every path before returning.  Bonus buffers are copied from
  * the dnode, holes and freed blocks get a zero-filled buffer, and anything
  * else is read through arc_read() with dbuf_read_done() as the callback.
  */
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_NOWAIT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
 		/*
 		 * The bonus buffer is always allocated at DN_MAX_BONUSLEN;
 		 * zero it when the bonus data will not fill it completely.
 		 */
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 		DB_DNODE_EXIT(db);
-		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	/*
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 		/* Nothing on disk to read: supply a zero-filled buffer. */
 		DB_DNODE_EXIT(db);
 		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
 		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_L2CACHE;
 	if (DBUF_IS_L2COMPRESSIBLE(db))
 		aflags |= ARC_L2COMPRESS;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Extra hold for the read; dbuf_read_done() releases one hold. */
 	dbuf_add_ref(db, NULL);
 
 	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
 	if (aflags & ARC_CACHED)
 		*flags |= DB_RF_CACHED;
 }
 
 /*
  * Make sure a held dbuf's data is (being) read in.
  *
  * db    - the dbuf; the caller must have a hold on it
  * zio   - optional parent zio; when NULL a root zio is created here and
  *         waited on before returning
  * flags - DB_RF_* flags: DB_RF_HAVESTRUCT means the caller already holds
  *         the dnode's struct_rwlock, DB_RF_NOPREFETCH disables the
  *         dmu_zfetch() call, DB_RF_NEVERWAIT skips waiting for another
  *         thread's in-flight read or fill
  *
  * Returns 0 on success, EIO if the dbuf is DB_NOFILL or ends up
  * DB_UNCACHED after waiting, or the result of zio_wait() on the root zio
  * created here.
  */
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t havepzio = (zio != NULL);
 	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_NOFILL)
 		return (SET_ERROR(EIO));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* Prefetch only for cacheable level-0 data blocks, unless disabled. */
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		/* Already cached: nothing to read, just feed the prefetcher. */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
 		spa_t *spa = dn->dn_objset->os_spa;
 
 		if (zio == NULL)
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		dbuf_read_impl(db, zio, &flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, flags & DB_RF_CACHED);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Only wait when the root zio was created in this call. */
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
 		/*
 		 * Another reader came in while the dbuf was in flight
 		 * between UNCACHED and CACHED.  Either a writer will finish
 		 * writing the buffer (sending the dbuf to CACHED) or the
 		 * first reader's request will reach the read_done callback
 		 * and send the dbuf to CACHED.  Otherwise, a failure
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
 				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 		}
 		mutex_exit(&db->db_mtx);
 	}
 
 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
 	return (err);
 }
 
 /*
  * Prepare a held, non-bonus dbuf for use without reading its contents.
  *
  * Waits out any in-flight read or fill, then: an uncached dbuf gets a
  * freshly allocated buffer and moves to DB_FILL; a DB_NOFILL dbuf has its
  * data detached; any other state must be DB_CACHED and is left alone.
  */
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
 
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	switch (db->db_state) {
 	case DB_UNCACHED: {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 		break;
 	}
 	case DB_NOFILL:
 		dbuf_set_data(db, NULL);
 		break;
 	default:
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 		break;
 	}
 
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * This is our just-in-time copy function.  It makes a copy of
  * buffers that have been modified in a previous transaction
  * group, before we modify them in the current active group.
  *
  * This function is used in two places: when we are dirtying a
  * buffer for the first time in a txg, and when we are freeing
  * a range in a dnode that includes this buffer.
  *
  * Note that when we are called from dbuf_free_range() we do
  * not put a hold on the buffer, we just traverse the active
  * dbuf list for the dnode.
  *
  * db  - level-0 dbuf with data attached; the caller must hold db_mtx
  * txg - the txg about to modify the dbuf
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	/*
 	 * Nothing to do unless the newest dirty record still shares the
 	 * dbuf's current data (db_data for bonus buffers, db_buf otherwise).
 	 */
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;
 
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and it's referencing the dbuf data, either:
 	 *	reset the reference to point to a new copy,
 	 * or (if there are no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT(dr->dr_txg >= txg - 2);
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		/* Holders beyond the dirty records exist: give dr a copy. */
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
 		/* No other holders: dr keeps the data, the dbuf lets go. */
 		dbuf_set_data(db, NULL);
 	}
 }
 
 /*
  * Cancel an override recorded in a level-0 dirty record: free the
  * overriding block (unless it is a hole or was a nopwrite), reset the
  * override state and nopwrite flag, and arc_release() the record's data.
  * No-op for bonus buffers and for records that are not overridden.
  * The caller must hold the dbuf's db_mtx.
  */
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
 		zio_free(db->db_objset->os_spa, txg, bp);
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
 	 * modifying the buffer, so they will immediately do
 	 * another (redundant) arc_release().  Therefore, leave
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
 	arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  *
  * dn          - dnode whose dbufs are examined
  * start_blkid - first level-0 block id in the range
  * end_blkid   - last level-0 block id in the range; clamped to
  *               dn_maxblkid unless it is DMU_SPILL_BLKID
  * tx          - transaction doing the free; its txg decides whether a
  *               dirty dbuf counts as dirty in open context
  *
  * Returns early without searching when start_blkid is at or beyond
  * dn_unlisted_l0_blkid.  While the objset is receiving, the search is
  * still performed but is counted in zfs_free_range_recv_miss; see the
  * comment below for details.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next, db_search;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 
 	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
 	/* Search key used only to locate a position in the dn_dbufs tree. */
 	db_search.db_level = 0;
 	db_search.db_blkid = start_blkid;
 	db_search.db_state = DB_SEARCH;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 		/* There can't be any dbufs in this range; no need to search. */
 #ifdef DEBUG
 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
 		ASSERT3P(db, ==, NULL);
 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 		ASSERT(db == NULL || db->db_level > 0);
 #endif
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return;
 	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
 		/*
 		 * If we are receiving, we expect there to be no dbufs in
 		 * the range to be freed, because receive modifies each
 		 * block at most once, and in offset order.  If this is
 		 * not the case, it can lead to performance problems,
 		 * so note that we unexpectedly took the slow path.
 		 */
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
 	/* Start at the first dbuf that sorts after the search key. */
 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
 	ASSERT3P(db, ==, NULL);
 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 	for (; db != NULL; db = db_next) {
 		/* Fetch the successor first; db may be destroyed below. */
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}
 
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
 			dbuf_clear(db);
 			continue;
 		}
 		/* The dbuf is referenced */
 
 		if (db->db_last_dirty != NULL) {
 			dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if it's not referenced in
 				 * the open context) or reset its contents to
 				 * empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if it's cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 /*
  * Decide whether the block backing this dbuf could be freed, based on
  * the txg in which it was (or will be) born.  Returns B_FALSE when no
  * birth txg can be determined.  The caller must hold db_mtx.
  */
 static int
 dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *dataset = db->db_objset->os_dsl_dataset;
 	blkptr_t *bp;
 	int bp_filled;
 	uint64_t born = 0;

 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	/*
 	 * db_blkptr needs no extra locking here: if the block is syncing,
 	 * db_last_dirty is set and its txg is used instead of the block
 	 * pointer's.  Only births of filled (non-hole) blocks count.
 	 */
 	bp = db->db_blkptr;
 	bp_filled = (bp != NULL && !BP_IS_HOLE(bp));
 	if (db->db_last_dirty != NULL && (bp == NULL || bp_filled))
 		born = db->db_last_dirty->dr_txg;
 	else if (bp_filled)
 		born = bp->blk_birth;

 	/*
 	 * A block that doesn't exist, or that lives in a snapshot, can't
 	 * be freed.  NULL is passed instead of the bp to
 	 * dsl_dataset_block_freeable() because we hold db_mtx and might
 	 * deadlock if we are prefetching a dedup-ed block.
 	 */
 	if (born == 0)
 		return (B_FALSE);
 	return (dataset == NULL ||
 	    dsl_dataset_block_freeable(dataset, NULL, born));
 }
 
 /*
  * Change the size of db's data buffer to `size` bytes in transaction tx.
  * The dbuf is dirtied, a new ARC buffer of the requested size is
  * allocated, the first MIN(old size, new size) bytes are copied into it
  * and any additional space is zeroed.  Must not be called on the bonus
  * buffer, and the caller must hold dn_struct_rwlock as writer (both are
  * asserted below).
  */
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;

 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

 	/*
 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
 	 */
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);

 	/* create the data buffer for the new block */
 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

 	/* copy old block data to the new block */
 	obuf = db->db_buf;
 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
 	/* zero the remainder */
 	if (size > osize)
 		bzero((uint8_t *)buf->b_data + osize, size - osize);

 	/*
 	 * Install the new buffer and drop the dbuf's reference on the old
 	 * one while holding db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
 	VERIFY(arc_buf_remove_ref(obuf, db));
 	db->db.db_size = size;

 	/* For level-0 dbufs, this txg's dirty record must see the new data. */
 	if (db->db_level == 0) {
 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 		db->db_last_dirty->dt.dl.dr_data = buf;
 	}
 	mutex_exit(&db->db_mtx);

 	/* Account for the size delta; it is negative when shrinking. */
 	dnode_willuse_space(dn, size-osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Call arc_release() on this dbuf's ARC buffer.  Only valid in syncing
  * context; the assertions also require that the objset's phys buffer
  * (or the dataset being on the synced list) and the parent's buffer
  * are in the expected released state.
  */
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	objset_t *objset = db->db_objset;

 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(objset)));
 	ASSERT(arc_released(objset->os_phys_buf) ||
 	    list_link_active(&objset->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL ||
 	    arc_released(db->db_parent->db_buf));

 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * Mark db dirty in the transaction group of tx and return the dirty
  * record for that txg.
  *
  * If db already has a dirty record for tx->tx_txg, that record is
  * returned (after resetting any override state for level-0 non-bonus
  * dbufs).  Otherwise a new record is allocated, linked into the
  * db_last_dirty chain, and an extra hold tagged with the txg is taken
  * on db.  The record is then placed on exactly one list: the dnode's
  * dn_dirty_records[] for bonus/spill dbufs and for dbufs at the top
  * level of the dnode, or the parent indirect's dr_children list (the
  * parent is dirtied recursively).
  *
  * The caller must hold a reference on db and tx must be assigned
  * (tx_txg != 0); both are asserted.  db_mtx must not be held on entry,
  * since it is acquired here.
  */
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	boolean_t do_free_accounting = B_FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;

 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);

 	mutex_enter(&dn->dn_mtx);
 	/*
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 		dn->dn_dirtyctx =
 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
 	}
 	mutex_exit(&dn->dn_mtx);

 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;

 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	drp = &db->db_last_dirty;
 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	/* Skip records for newer txgs; drp ends up at the insertion point. */
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);

 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
 			/*
 			 * If this buffer has already been written out,
 			 * we now need to reset its state.
 			 */
 			dbuf_unoverride(dr);
 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
 			    db->db_state != DB_NOFILL)
 				arc_buf_thaw(db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}

 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);

 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * Update the accounting.
 		 * Note: we delay "free accounting" until after we drop
 		 * the db_mtx.  This keeps us from grabbing other locks
 		 * (and possibly deadlocking) in bp_get_dsize() while
 		 * also holding the db_mtx.
 		 */
 		dnode_willuse_space(dn, db->db.db_size, tx);
 		do_free_accounting = dbuf_block_freeable(db);
 	}

 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;

 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		/* Indirect: set up the lock and list for child dirty records. */
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
 		dr->dr_accounted = db->db.db_size;
 	/* Link the new record into the chain at the position found above. */
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
 	*drp = dr;

 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}

 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);

 	mutex_exit(&db->db_mtx);

 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		/* Bonus and spill records always go on the dnode's list. */
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	} else if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
 		/*
 		 * This is only a guess -- if the dbuf is dirty
 		 * in a previous txg, we don't know how much
 		 * space it will use on disk yet.  We should
 		 * really have the struct_rwlock to access
 		 * db_blkptr, but since this is just a guess,
 		 * it's OK if we get an odd answer.
 		 */
 		ddt_prefetch(os->os_spa, bp);
 		dnode_willuse_space(dn, -willfree, tx);
 	}

 	/* Take dn_struct_rwlock as reader unless we already hold it as writer. */
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}

 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}

 	if (db->db_level+1 < dn->dn_nlevels) {
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;

 		/* No indirect parent attached yet: look it up and hold it. */
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

 			parent = dbuf_hold_level(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level+1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);

 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		/* Top-level block of the dnode: record goes on the dnode. */
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}

 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  *
  * Only valid for level-0, non-bonus dbufs, and the caller must hold
  * db_mtx (all asserted).  When B_FALSE is returned this function has
  * not released db_mtx.  When B_TRUE is returned the dbuf has been
  * handed to dbuf_evict(); the caller in dbuf_free_range() treats that
  * as "mutex dropped and dbuf destroyed" and must not touch db again.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;

 	ASSERT(txg != 0);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
 	if (dr == NULL || dr->dr_txg < txg)
 		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

 	ASSERT(db->db.db_size != 0);

 	/*
 	 * Any space we accounted for in dp_dirty_* will be cleaned up by
 	 * dsl_pool_sync().  This is relatively rare so the discrepancy
 	 * is not a big deal.
 	 */

 	/* Unlink the record from the dbuf's dirty-record chain. */
 	*drp = dr->dr_next;

 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level+1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 	DB_DNODE_EXIT(db);

 	if (db->db_state != DB_NOFILL) {
 		dbuf_unoverride(dr);

 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		/* Drop the record's private buffer if it isn't db_buf itself. */
 		if (dr->dt.dl.dr_data != db->db_buf)
 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}

 	/*
 	 * NOTE(review): db_level was asserted to be 0 above, so this branch
 	 * looks unreachable when assertions are enabled.
 	 */
 	if (db->db_level != 0) {
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}

 	kmem_free(dr, sizeof (dbuf_dirty_record_t));

 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;

 	/*
 	 * Drop the hold that dbuf_dirty() took with the txg as its tag.  If
 	 * that was the last hold, detach the data buffer and evict the dbuf.
 	 */
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		arc_buf_t *buf = db->db_buf;

 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
 		dbuf_set_data(db, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		dbuf_evict(db);
 		return (B_TRUE);
 	}

 	return (B_FALSE);
 }
 
 /*
  * Read the dbuf's current contents and then mark it dirty in tx.  The
  * caller must hold a reference on the dbuf and tx must be assigned.
  */
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;
 	int read_flags = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 	int have_struct;

 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&dbuf->db_holds));

 	/* Let dbuf_read() know if we already own dn_struct_rwlock. */
 	DB_DNODE_ENTER(dbuf);
 	have_struct = RW_WRITE_HELD(&DB_DNODE(dbuf)->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
 	if (have_struct)
 		read_flags |= DB_RF_HAVESTRUCT;

 	(void) dbuf_read(dbuf, NULL, read_flags);
 	(void) dbuf_dirty(dbuf, tx);
 }
 
 /*
  * Put the dbuf into the DB_NOFILL state and then dirty it through
  * dmu_buf_will_fill().
  */
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;

 	dbuf->db_state = DB_NOFILL;
 	dmu_buf_will_fill(db_fake, tx);
 }
 
 /*
  * Prepare a level-0, non-bonus dbuf to be completely overwritten: skip
  * reading the old contents (dbuf_noread()) and mark it dirty in tx.
  */
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;

 	ASSERT(dbuf->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(dbuf->db_level == 0);
 	ASSERT(!refcount_is_zero(&dbuf->db_holds));

 	/* The meta-dnode may only be filled by a private-capable tx. */
 	ASSERT(dmu_tx_private_ok(tx) ||
 	    dbuf->db.db_object != DMU_META_DNODE_OBJECT);

 	dbuf_noread(dbuf);
 	(void) dbuf_dirty(dbuf, tx);
 }
 
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
 /*
  * Finish a fill: if the dbuf is in DB_FILL, move it to DB_CACHED and
  * wake any waiters on db_changed.  A dbuf in any other state is left
  * untouched.
  */
 /* ARGSUSED */
 void
 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);

 	if (db->db_state != DB_FILL) {
 		mutex_exit(&db->db_mtx);
 		return;
 	}

 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		/*
 		 * The block was freed while it was being filled, so
 		 * discard whatever was written into it.
 		 */
 		/* XXX dbuf_undirty? */
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_freed_in_flight = FALSE;
 	}
 	db->db_state = DB_CACHED;
 	cv_broadcast(&db->db_changed);

 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Record an embedded block pointer for this level-0, non-bonus dbuf.
  * The dbuf is dirtied via dmu_buf_will_not_fill() and the resulting
  * dirty record is marked DR_OVERRIDDEN with a block pointer that
  * encodes `data` directly.
  */
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)dbuf;
 	dmu_object_type_t objtype;
 	struct dirty_leaf *leaf;

 	ASSERT0(impl->db_level);
 	ASSERT(impl->db_blkid != DMU_BONUS_BLKID);

 	/* The override bp carries the owning dnode's object type. */
 	DB_DNODE_ENTER(impl);
 	objtype = DB_DNODE(impl)->dn_type;
 	DB_DNODE_EXIT(impl);

 	dmu_buf_will_not_fill(dbuf, tx);

 	ASSERT3U(impl->db_last_dirty->dr_txg, ==, tx->tx_txg);
 	leaf = &impl->db_last_dirty->dt.dl;

 	encode_embedded_bp_compressed(&leaf->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&leaf->dr_overridden_by, etype);
 	BP_SET_TYPE(&leaf->dr_overridden_by, objtype);
 	BP_SET_LEVEL(&leaf->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&leaf->dr_overridden_by, byteorder);

 	leaf->dr_override_state = DR_OVERRIDDEN;
 	leaf->dr_overridden_by.blk_birth = impl->db_last_dirty->dr_txg;
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  *
  * The dbuf must be a held, level-0, non-bonus data dbuf whose size equals
  * the size of buf (all asserted).  On the copy path the dbuf's reference
  * on buf is removed after the copy; on the direct path buf becomes the
  * dbuf's data.  In both cases the dbuf ends up dirty in tx.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
 	ASSERT(buf != NULL);
 	ASSERT(arc_buf_size(buf) == db->db.db_size);
 	ASSERT(tx->tx_txg != 0);

 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));

 	mutex_enter(&db->db_mtx);

 	/* Wait for any read or fill in progress to finish. */
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);

 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

 	/*
 	 * Cached, and held by more than the caller plus the dirty records:
 	 * someone else may be looking at the current buffer, so copy into
 	 * it instead of swapping it out.
 	 */
 	if (db->db_state == DB_CACHED &&
 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		xuio_stat_wbuf_copied();
 		return;
 	}

 	xuio_stat_wbuf_nocopy();
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = db->db_last_dirty;

 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			/* Already dirty in this txg: swap the record's data. */
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			/* No dirty record refers to db_buf, so drop it. */
 			arc_release(db->db_buf, db);
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		}
 		db->db_buf = NULL;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	/* Treat the assignment as a fill so dbuf_fill_done() completes it. */
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx);
 }
 
 /*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
  * EVICTING and clear *most* of its references.  Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
  * in this case.  For callers from the DMU we will usually see:
  *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
  *	DMU: dbuf_clear()->arc_clear_callback()
  *	ARC: dbuf_do_evict()->dbuf_destroy()
  *
  * This routine will dissociate the dbuf from the arc, by calling
  * arc_clear_callback(), but will not evict the data from the ARC.
  *
  * The caller must hold db_mtx and the dbuf must have no holds (both
  * asserted).  db_mtx is released here unless arc_clear_callback()
  * reports that the dbuf is already gone.
  */
 void
 dbuf_clear(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 	boolean_t dbuf_gone = B_FALSE;

 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));

 	dbuf_evict_user(db);

 	if (db->db_state == DB_CACHED) {
 		ASSERT(db->db.db_data != NULL);
 		/* Bonus data is freed here and its ARC space returned. */
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db.db_data = NULL;
 		db->db_state = DB_UNCACHED;
 	}

 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);

 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	/* Only unlink from dn_dbufs if this thread holds dn_dbufs_mtx. */
 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
 		avl_remove(&dn->dn_dbufs, db);
 		atomic_dec_32(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		dnode_rele(dn, db);
 		db->db_dnode_handle = NULL;
 	} else {
 		DB_DNODE_EXIT(db);
 	}

 	if (db->db_buf)
 		dbuf_gone = arc_clear_callback(db->db_buf);

 	/* If the dbuf is gone, db (and its mutex) must not be touched. */
 	if (!dbuf_gone)
 		mutex_exit(&db->db_mtx);

 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
 /*
  * Locate the block pointer for block blkid at the given level of dn.
  *
  * On success returns 0 with *bpp pointing at the block pointer and
  * *parentp set to the dbuf that contains it, on which a hold with a
  * NULL tag has been taken.  Two special cases on success:
  *   - for the spill block, *bpp is NULL if the dnode has no spill
  *     block pointer;
  *   - when the block pointer lives in the dnode and dn->dn_dbuf is
  *     NULL, *parentp stays NULL.
  * Returns ENOENT if the block has no parent yet, or the error from
  * holding or reading the parent indirect block; in those cases both
  * *parentp and *bpp are NULL.
  *
  * fail_sparse is passed through to dbuf_hold_impl() when holding the
  * parent indirect.  Must not be called for the bonus block.
  */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	int nlevels, epbs;

 	*parentp = NULL;
 	*bpp = NULL;

 	ASSERT(blkid != DMU_BONUS_BLKID);

 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = &dn->dn_phys->dn_spill;
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}

 	/* An on-disk level count of 0 is treated as a single level. */
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
 	else
 		nlevels = dn->dn_phys->dn_nlevels;

 	/* log2 of the number of block pointers per indirect block */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
 		    blkid >> epbs, fail_sparse, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		/* Index into the parent's array of block pointers. */
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 /*
  * Allocate and initialize a dbuf for block blkid at the given level of
  * dn, with the supplied parent dbuf and block pointer.
  *
  * The bonus dbuf is returned immediately without being entered in the
  * hash table or in dn_dbufs.  For any other dbuf, if a matching dbuf is
  * already in the hash table the new allocation is freed and the
  * existing one is returned.  Otherwise the new dbuf is added to
  * dn_dbufs, a hold is taken on the parent (unless the parent is the
  * dnode's own dbuf) and on the dnode, and the new dbuf is returned.
  *
  * The caller must hold dn_struct_rwlock.
  * NOTE(review): dbuf_hold_impl() unlocks db_mtx on the dbuf returned
  * from here without locking it first, so dbuf_hash_insert() presumably
  * returns with db_mtx held in both outcomes -- confirm.
  */
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;

 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);

 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
 	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;

 	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
 	db->db_evict_func = NULL;
 	db->db_immediate_evict = 0;
 	db->db_freed_in_flight = 0;

 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		/* The bonus area shrinks by one blkptr per extra dn_nblkptr. */
 		db->db.db_size = DN_MAX_BONUSLEN -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		/* Indirect blocks use the indirect size, data blocks dn_datablksz. */
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}

 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before it's added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 	if (db->db_level == 0 && db->db_blkid >=
 	    dn->dn_unlisted_l0_blkid)
 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
 	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);

 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
 	atomic_inc_32(&dn->dn_dbufs_count);

 	dprintf_dbuf(db, "db=%p\n", db);

 	return (db);
 }
 
 /*
  * Eviction entry point; `private` is the dbuf being evicted.  The dbuf
  * must have no holds.  db_mtx is acquired here unless the calling
  * thread already holds it.
  *
  * If the dbuf is not yet EVICTING it must be CACHED: its db_buf pointer
  * is cleared and it is passed to dbuf_evict() with db_mtx still held.
  * If it is already EVICTING, db_mtx is dropped and the dbuf is
  * destroyed.  Always returns 0.
  */
 static int
 dbuf_do_evict(void *private)
 {
 	dmu_buf_impl_t *db = private;

 	if (!MUTEX_HELD(&db->db_mtx))
 		mutex_enter(&db->db_mtx);

 	ASSERT(refcount_is_zero(&db->db_holds));

 	if (db->db_state != DB_EVICTING) {
 		ASSERT(db->db_state == DB_CACHED);
 		DBUF_VERIFY(db);
 		db->db_buf = NULL;
 		dbuf_evict(db);
 	} else {
 		mutex_exit(&db->db_mtx);
 		dbuf_destroy(db);
 	}
 	return (0);
 }
 
 /*
  * Free a dbuf that has no remaining holds.  A non-bonus dbuf is first
  * removed from its dnode's dn_dbufs (if it still has a dnode handle,
  * which also releases its hold on the dnode) and from the dbuf hash
  * table.  The structure is then returned to dbuf_cache and the ARC
  * space accounting is reduced accordingly.
  */
 static void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	ASSERT(refcount_is_zero(&db->db_holds));

 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * If this dbuf is still on the dn_dbufs list,
 		 * remove it from that list.
 		 */
 		if (db->db_dnode_handle != NULL) {
 			dnode_t *dn;

 			DB_DNODE_ENTER(db);
 			dn = DB_DNODE(db);
 			mutex_enter(&dn->dn_dbufs_mtx);
 			avl_remove(&dn->dn_dbufs, db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 			mutex_exit(&dn->dn_dbufs_mtx);
 			DB_DNODE_EXIT(db);
 			/*
 			 * Decrementing the dbuf count means that the hold
 			 * corresponding to the removed dbuf is no longer
 			 * discounted in dnode_move(), so the dnode cannot be
 			 * moved until after we release the hold.
 			 */
 			dnode_rele(dn, db);
 			db->db_dnode_handle = NULL;
 		}
 		dbuf_hash_remove(db);
 	}
 	db->db_parent = NULL;
 	db->db_buf = NULL;

 	/* Everything else must already have been torn down. */
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);

 	kmem_cache_free(dbuf_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
 /*
  * Issue a speculative, asynchronous ARC read for level-0 block blkid
  * of dn at the given I/O priority.  Nothing is done if the block has
  * been freed, if a dbuf for it already exists, or if it has no block
  * pointer worth reading (a hole or an embedded bp).  The caller must
  * hold dn_struct_rwlock.
  */
 void
 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 {
 	dmu_buf_impl_t *db = NULL;
 	blkptr_t *bp = NULL;

 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

 	if (dnode_block_freed(dn, blkid))
 		return;

 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, 0, blkid);
 	if (db != NULL) {
 		/*
 		 * A dbuf already exists for this block.  Assume it is
 		 * either CACHED already or about to be read or filled.
 		 */
 		mutex_exit(&db->db_mtx);
 		return;
 	}

 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) != 0)
 		return;

 	if (bp != NULL && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 		uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 		zbookmark_phys_t zb;

 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    dn->dn_object, 0, blkid);

 		(void) arc_read(NULL, dn->dn_objset->os_spa,
 		    bp, NULL, NULL, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &aflags, &zb);
 	}

 	/* Drop the hold dbuf_findbp() took on the parent, if any. */
 	if (db != NULL)
 		dbuf_rele(db, NULL);
 }
 
 /*
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  *
  * Looks up (or creates) the dbuf for block blkid at the given level of
  * dn, takes a hold on it with `tag`, and stores it in *dbp.  Returns 0
  * on success.  On failure *dbp is NULL and an errno is returned: with
  * fail_sparse set, ENOENT is returned when the block pointer cannot be
  * found or is a hole; without it, an ENOENT from dbuf_findbp() is
  * tolerated and the dbuf is created anyway.  Must not be used for the
  * bonus block.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;

 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);

 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, level, blkid);

 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;

 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}

 	/*
 	 * First hold on a dbuf that still has an ARC buffer: re-reference
 	 * the buffer.  If its data is gone, clear the dbuf and start over.
 	 */
 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
 		arc_buf_add_ref(db->db_buf, db);
 		if (db->db_buf->b_data == NULL) {
 			dbuf_clear(db);
 			if (parent) {
 				dbuf_rele(parent, NULL);
 				parent = NULL;
 			}
 			goto top;
 		}
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 	}

 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

 	/*
 	 * If this buffer is currently syncing out, and we are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;

 		if (dr->dt.dl.dr_data == db->db_buf) {
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

 			dbuf_set_data(db,
 			    arc_buf_alloc(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
 		}
 	}

 	(void) refcount_add(&db->db_holds, tag);
-	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);

 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);

 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;

 	return (0);
 }
 
 /*
  * Hold the level-0 dbuf for block blkid of dn.  Returns the held dbuf,
  * or NULL if dbuf_hold_impl() fails.
  */
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *held;

 	if (dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &held) != 0)
 		return (NULL);
 	return (held);
 }
 
 /*
  * Hold the dbuf for block blkid at the given level of dn.  Returns the
  * held dbuf, or NULL if dbuf_hold_impl() fails.
  */
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *held;

 	if (dbuf_hold_impl(dn, level, blkid, FALSE, tag, &held) != 0)
 		return (NULL);
 	return (held);
 }
 
 /*
  * Create the bonus dbuf for dn and store it in dn->dn_bonus.  The dnode
  * must not already have one, and the caller must hold dn_struct_rwlock
  * as writer.
  */
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	dmu_buf_impl_t *bonus;

 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_bonus == NULL);

 	bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
 	dn->dn_bonus = bonus;
 }
 
 /*
  * Resize a spill block.  A blksz of 0 selects SPA_MINBLOCKSIZE; any
  * other value is rounded up to a multiple of SPA_MINBLOCKSIZE.
  * Returns ENOTSUP if db_fake is not a spill dbuf, otherwise 0.
  */
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *spill = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 	uint64_t newsz;

 	if (spill->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));

 	newsz = (blksz == 0) ? SPA_MINBLOCKSIZE : blksz;
 	ASSERT3U(newsz, <=,
 	    spa_maxblocksize(dmu_objset_spa(spill->db_objset)));
 	newsz = P2ROUNDUP(newsz, SPA_MINBLOCKSIZE);

 	/* dbuf_new_size() requires dn_struct_rwlock held as writer. */
 	DB_DNODE_ENTER(spill);
 	dnode = DB_DNODE(spill);
 	rw_enter(&dnode->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(spill, newsz, tx);
 	rw_exit(&dnode->dn_struct_rwlock);
 	DB_DNODE_EXIT(spill);

 	return (0);
 }
 
 /*
  * Free the spill block of dn in transaction tx by freeing the
  * single-block range [DMU_SPILL_BLKID, DMU_SPILL_BLKID].
  */
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 }
 
 /*
  * Add another hold, owned by `tag`, to a dbuf that is already held.
  * The assertion checks that this was not the first hold.  Also exported
  * under the weak alias dmu_buf_add_ref.
  */
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t nholds;
 
 	/* Keep the add outside ASSERT() so it is not compiled away with it. */
 	nholds = refcount_add(&db->db_holds, tag);
 	ASSERT(nholds > 1);
 }
 
 /*
  * Drop the hold that `tag` has on `db`.
  *
  * After calling dbuf_rele() you must not reference the dnode handle unless
  * you keep some other direct or indirect hold on the dnode.  (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Otherwise this release could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	/* db_mtx is taken here and handed to dbuf_rele_and_unlock(). */
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag);
 }
 
 /*
  * Public-handle variant of dbuf_rele(): casts the dmu_buf_t to its
  * dmu_buf_impl_t and releases the hold owned by `tag`.
  */
 void
 dmu_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
 
 	dbuf_rele(impl, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
  *
  * The caller must hold db->db_mtx (asserted).  No path returns with an
  * explicit hold on it: each branch either calls mutex_exit() itself or
  * passes the dbuf to dbuf_evict()/dbuf_clear(), which are never followed by
  * a mutex_exit() here.
  * NOTE(review): presumably dbuf_evict()/dbuf_clear() drop or destroy the
  * mutex themselves -- confirm against their definitions.
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
 		arc_buf_freeze(db->db_buf);
 
 	/*
 	 * For a level-0 dbuf marked db_immediate_evict, run the user eviction
 	 * as soon as the only remaining holds are those counted in
 	 * db_dirtycnt.
 	 */
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_immediate_evict)
 		dbuf_evict_user(db);
 
 	/* Last hold dropped: decide what happens to the dbuf and its data. */
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			mutex_exit(&db->db_mtx);
 
 			/*
 			 * If the dnode moves here, we cannot cross this barrier
 			 * until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 			atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count);
 			DB_DNODE_EXIT(db);
 			/*
 			 * The bonus buffer's dnode hold is no longer discounted
 			 * in dnode_move(). The dnode cannot move until after
 			 * the dnode_rele().
 			 */
 			dnode_rele(DB_DNODE(db), db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_evict(db);
 		} else if (arc_released(db->db_buf)) {
 			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_set_data(db, NULL);
 			VERIFY(arc_buf_remove_ref(buf, db));
 			dbuf_evict(db);
 		} else {
 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 
 			/*
 			 * A dbuf will be eligible for eviction if either the
 			 * 'primarycache' property is set or a duplicate
 			 * copy of this buffer is already cached in the arc.
 			 *
 			 * In the case of the 'primarycache' a buffer
 			 * is considered for eviction if it matches the
 			 * criteria set in the property.
 			 *
 			 * To decide if our buffer is considered a
 			 * duplicate, we must call into the arc to determine
 			 * if multiple buffers are referencing the same
 			 * block on-disk. If so, then we simply evict
 			 * ourselves.
 			 */
 			if (!DBUF_IS_CACHEABLE(db)) {
 				if (db->db_blkptr != NULL &&
 				    !BP_IS_HOLE(db->db_blkptr) &&
 				    !BP_IS_EMBEDDED(db->db_blkptr)) {
 					spa_t *spa =
 					    dmu_objset_spa(db->db_objset);
 					/*
 					 * Copy the bp by value before
 					 * dbuf_clear(); the copy is what gets
 					 * passed to arc_freed() afterwards.
 					 */
 					blkptr_t bp = *db->db_blkptr;
 					dbuf_clear(db);
 					arc_freed(spa, &bp);
 				} else {
 					dbuf_clear(db);
 				}
 			} else if (arc_buf_eviction_needed(db->db_buf)) {
 				dbuf_clear(db);
 			} else {
 				mutex_exit(&db->db_mtx);
 			}
 		}
 	} else {
 		/* Other holds remain; just drop the lock. */
 		mutex_exit(&db->db_mtx);
 	}
 }
 
 /*
  * Report the current number of holds on `db`.  Also exported under the
  * weak alias dmu_buf_refcount.
  */
 #pragma weak dmu_buf_refcount = dbuf_refcount
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	uint64_t nholds = refcount_count(&db->db_holds);
 
 	return (nholds);
 }
 
 /*
  * Attach user data to a dbuf that is expected to have none: forwards to
  * dmu_buf_update_user() with an old_user_ptr of NULL and returns its result
  * (see that function for the meaning of the return value).
  */
 void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr,
     dmu_buf_evict_func_t *evict_func)
 {
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
 }
 
 /*
  * Like dmu_buf_set_user(), but first marks the dbuf db_immediate_evict.
  * Note that the flag is set before, and regardless of the outcome of, the
  * dmu_buf_update_user() call, and without taking db_mtx here.
  */
 void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
     dmu_buf_evict_func_t *evict_func)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	db->db_immediate_evict = TRUE;
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
 }
 
 /*
  * Conditionally replace the user pointer and eviction callback of a
  * level-0 dbuf, under db_mtx.
  *
  * If the dbuf's current user pointer equals `old_user_ptr`, `user_ptr` and
  * `evict_func` are installed and `old_user_ptr` is returned.  Otherwise
  * nothing is changed and the currently installed user pointer is returned,
  * so a return value different from `old_user_ptr` means the update did not
  * happen.
  *
  * `user_ptr` and `evict_func` must be both NULL or both non-NULL (asserted).
  */
 void *
 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+    dmu_buf_evict_func_t *evict_func)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT(db->db_level == 0);
 
 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
 
 	mutex_enter(&db->db_mtx);
 
 	if (db->db_user_ptr == old_user_ptr) {
 		db->db_user_ptr = user_ptr;
-		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
 		db->db_evict_func = evict_func;
-
-		dbuf_update_data(db);
 	} else {
 		/* Mismatch: report what is actually installed. */
 		old_user_ptr = db->db_user_ptr;
 	}
 
 	mutex_exit(&db->db_mtx);
 	return (old_user_ptr);
 }
 
 /*
  * Return the user pointer currently attached to the dbuf.  The dbuf must be
  * held (asserted); db_mtx is not taken.
  */
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(!refcount_is_zero(&impl->db_holds));
 	return (impl->db_user_ptr);
 }
 
 /*
  * Report whether the block behind `dbuf` is freeable according to
  * dsl_dataset_block_freeable(), evaluated at the block's own birth txg.
  * A dbuf without a block pointer is never freeable.
  */
 boolean_t
 dmu_buf_freeable(dmu_buf_t *dbuf)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)dbuf;
 	boolean_t freeable;
 
 	if (impl->db_blkptr == NULL)
 		return (B_FALSE);
 
 	freeable = dsl_dataset_block_freeable(
 	    impl->db_objset->os_dsl_dataset, impl->db_blkptr,
 	    impl->db_blkptr->blk_birth);
 	return (freeable);
 }
 
 /*
  * Return the dbuf's block pointer (db_blkptr), which may be NULL.
  */
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	return (((dmu_buf_impl_t *)db)->db_blkptr);
 }
 
 /*
  * Make sure db->db_blkptr is set.  Does nothing if it is already non-NULL.
  * Otherwise the block pointer is located in one of three places:
  *   - the dnode's dn_spill field, for the spill block (which is zeroed);
  *   - the dnode's dn_blkptr[] array, for a block at the top level;
  *   - the parent indirect block's data, for everything else.
  *
  * Called with db->db_mtx held (asserted).  The mutex is dropped and
  * re-taken if the parent dbuf has to be held.
  */
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/*
 	 * Would like to ASSERT(dmu_tx_is_syncing(tx)), but no tx is passed
 	 * to this function.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = &dn->dn_phys->dn_spill;
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mis-match).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		/* log2 of the number of block pointers per indirect block */
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			/*
 			 * db_mtx is dropped around the hold of the parent,
 			 * which is taken under dn_struct_rwlock as reader.
 			 * NOTE(review): the return value of dbuf_hold_impl()
 			 * is ignored; if it can fail here, `parent` may still
 			 * be NULL when dereferenced below -- confirm.
 			 */
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			(void) dbuf_hold_impl(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FALSE, db, &parent);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		/* Index into the parent's array of block pointers. */
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 /*
  * Sync one dirty indirect (level > 0) dbuf in syncing context.
  *
  * The block is read in if it has no buffer yet, its block pointer is
  * resolved with dbuf_check_blkptr(), and `dr` is published as
  * db_data_pending.  dbuf_write() then creates the write zio (dr->dr_zio),
  * the dirty children are synced with dbuf_sync_list(), and only after that
  * is the zio started with zio_nowait().
  */
 static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		/* dbuf_read() is called without db_mtx held. */
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 	dbuf_write(dr, db->db_buf, tx);
 
 	/*
 	 * Save the zio before syncing the children, then start it once all
 	 * children have been processed and the child list is empty.
 	 */
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
 }
 
 /*
  * Sync one dirty leaf (level-0) dbuf in syncing context.
  *
  * Bonus buffers are not written separately: their contents are copied into
  * the dnode's bonus area, the dirty record is unlinked and freed, and the
  * hold tagged with the txg is released.  For all other leaves the block
  * pointer is resolved, any in-flight dmu_sync of this record is waited for,
  * a private copy of the data is made if the buffer is still in use, and the
  * write is issued through dbuf_write().
  */
 static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we
 	 * might have been freed after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* Syncing a spill block marks the dnode as having a spill blkptr. */
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
 
 		ASSERT(*datap != NULL);
 		ASSERT0(db->db_level);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
 		DB_DNODE_EXIT(db);
 
 		/*
 		 * A dirty copy distinct from the dbuf's own data is freed
 		 * here as a DN_MAX_BONUSLEN-sized zio buffer.
 		 */
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db_data_pending = NULL;
 		/* Unlink dr from the db_last_dirty list (it must be last). */
 		drp = &db->db_last_dirty;
 		while (*drp != dr)
 			drp = &(*drp)->dr_next;
 		ASSERT(dr->dr_next == NULL);
 		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
 		/*
 		 * NOTE(review): db_level was asserted to be 0 above, so this
 		 * branch looks unreachable for a bonus buffer -- confirm.
 		 */
 		if (dr->dr_dbuf->db_level != 0) {
 			list_destroy(&dr->dt.di.dr_children);
 			mutex_destroy(&dr->dt.di.dr_mtx);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		/* Drops the hold tagged with this txg and releases db_mtx. */
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * meta-dnode blocks, which the DMU_META_DNODE_OBJECT
 		 * check above excludes).
 		 */
 		int blksz = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		/*
 		 * Meta-dnode records are put back on the dirty list with
 		 * their zio already created instead of being started here.
 		 */
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
 		DB_DNODE_EXIT(db);
 	} else {
 		/*
 		 * Although zio_nowait() does not "wait for an IO", it does
 		 * initiate the IO. If this is an empty write it seems plausible
 		 * that the IO could actually be completed before the nowait
 		 * returns. We need to DB_DNODE_EXIT() first in case
 		 * zio_nowait() invalidates the dbuf.
 		 */
 		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 /*
  * Sync the dirty records on `list`, removing each one from the list before
  * handing it to dbuf_sync_indirect() (level > 0) or dbuf_sync_leaf()
  * (level 0).  Processing stops when the list is empty or when a record that
  * already has a zio is found at the head.
  */
 void
 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	/*
 	 * The assignment is parenthesized and compared explicitly so that it
 	 * cannot be mistaken for (or flagged as) a mistyped '=='.
 	 */
 	while ((dr = list_head(list)) != NULL) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
 		else
 			dbuf_sync_leaf(dr, tx);
 	}
 }
 
 /*
  * Write "ready" callback for a dbuf (`vdb`); `buf` is unused.
  *
  * Charges the dnode for the change in on-disk size between the original and
  * the new block pointer, raises dn_maxblkid for level-0 blocks beyond the
  * current maximum, and computes the block's fill count, which is stored in
  * bp->blk_fill unless the block pointer is embedded.
  */
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Only the change since the previous invocation for this zio is
 	 * charged; io_prev_space_delta remembers what was already applied.
 	 */
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (bp->blk_birth != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			/* Fill = number of allocated dnodes in this block. */
 			dnode_phys_t *dnp = db->db.db_data;
 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
 			    i--, dnp++) {
 				if (dnp->dn_type != DMU_OT_NONE)
 					fill++;
 			}
 		} else {
 			/* Any other data block counts as 1 unless a hole. */
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		/* Indirect block: sum the fill of all non-hole children. */
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	if (!BP_IS_EMBEDDED(bp))
 		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Per-physical-child completion callback.
  *
  * The SPA invokes this once for every physical child i/o of the zio
  * (zio->io_phys_children times in total), e.g. for each copy of an indirect
  * block or each fragment of a RAID-Z block.  Since those may complete far
  * apart in time, the DMU uses the callback to retire dirty space gradually
  * as the physical i/os finish.
  */
 /* ARGSUSED */
 static void
 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	dmu_buf_impl_t *db = arg;
 	dsl_pool_t *pool = dmu_objset_pool(db->db_objset);
 	dbuf_dirty_record_t *pending = db->db_data_pending;
 	int share;
 
 	ASSERT3U(pending->dr_txg, ==, zio->io_txg);
 
 	/*
 	 * Retire one equal share of the accounted dirty space per call.  Any
 	 * rounding error is cleaned up by dsl_pool_sync()'s call to
 	 * dsl_pool_undirty_space().
 	 */
 	share = pending->dr_accounted / zio->io_phys_children;
 	dsl_pool_undirty_space(pool, share, zio->io_txg);
 }
 
 /*
  * Write "done" callback for a dbuf (`vdb`); `buf` is unused.
  *
  * Unless the write was a rewrite or nopwrite, the old block is killed and
  * the new one born in the dataset's accounting.  The pending dirty record is
  * then unlinked from the dbuf and freed, waiters on db_changed are woken,
  * db_dirtycnt is decremented, and the hold tagged with the syncing txg is
  * released through dbuf_rele_and_unlock().
  */
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	/*
 	 * Find db_data_pending on the db_last_dirty list and unlink it; it
 	 * is asserted to be the last record on that list.
 	 */
 	drp = &db->db_last_dirty;
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 		DB_DNODE_EXIT(db);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			/*
 			 * A private copy made for the write is dropped here;
 			 * otherwise the dbuf's own buffer gets its eviction
 			 * callback (re)installed unless it is released.
 			 */
 			if (dr->dt.dl.dr_data != db->db_buf)
 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
 				    db));
 			else if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 			if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 		DB_DNODE_EXIT(db);
 		/* Indirect dirty records own a child list and its mutex. */
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	/* Releases the hold tagged with the syncing txg and drops db_mtx. */
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 /*
  * "Ready" callback for NOFILL writes: the zio's private pointer is the
  * dbuf, and there is no arc buffer to pass along.
  */
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, vdb);
 }
 
 /*
  * "Done" callback for NOFILL writes: the zio's private pointer is the
  * dbuf, and there is no arc buffer to pass along.
  */
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_done(zio, NULL, vdb);
 }
 
 /*
  * "Ready" callback for overridden writes.  Here the zio's private pointer
  * is the dirty record, so the dbuf is taken from dr_dbuf.
  */
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, rec->dr_dbuf);
 }
 
 /*
  * "Done" callback for overridden writes (zio private pointer is the dirty
  * record).  If the block pointer that was actually written differs from the
  * override bp, the override bp is freed (unless it is a hole) and the dirty
  * data buffer is released, all under db_mtx.  The common completion work is
  * then done by dbuf_write_done().
  */
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 	dmu_buf_impl_t *owner = rec->dr_dbuf;
 	blkptr_t *override_bp = &rec->dt.dl.dr_overridden_by;
 
 	mutex_enter(&owner->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, override_bp)) {
 		if (!BP_IS_HOLE(override_bp)) {
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg,
 			    override_bp);
 		}
 		arc_release(rec->dt.dl.dr_data, owner);
 	}
 	mutex_exit(&owner->db_mtx);
 
 	dbuf_write_done(zio, NULL, owner);
 }
 
 /*
  * Issue I/O to commit a dirty buffer to disk.
  *
  * Creates the write zio for dirty record `dr` and stores it in dr->dr_zio;
  * the zio is not started here.  The parent zio is either the pending write
  * of the parent indirect block or, when the parent is the dnode itself,
  * dn->dn_zio.  One of three kinds of zio is created:
  *   - an overridden write, when open context already provided the bp;
  *   - a NOFILL write carrying no data;
  *   - a regular arc_write() of `data`.
  */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int wp_flag = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
 			 * in the syncing context and we don't want the
 			 * overhead of making multiple copies of the data.
 			 */
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
 				dbuf_release_bp(db);
 			}
 		}
 	}
 
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		zio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
 	ASSERT(zio);
 
 	/* An objset without a dataset is bookmarked as DMU_META_OBJSET. */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
 		void *contents = (data != NULL) ? data->b_data : NULL;
 
 		/* Note: the private argument here is dr, not db. */
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, contents, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		/* The override state is reset under db_mtx. */
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
 		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c	(revision 275782)
@@ -1,251 +1,251 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 
 /*
  * State shared between dmu_diff() and its traversal callback: where the
  * records go, the record currently being accumulated, and the first error.
  */
 struct diffarg {
 	struct file *da_fp;		/* file to which we are reporting */
 	offset_t *da_offp;		/* advanced by each record written */
 	int da_err;			/* error that stopped diff search */
 	dmu_diff_record_t da_ddr;	/* record being built; DDR_NONE if empty */
 	kthread_t *da_td;		/* thread doing the write (curthread) */
 };
 
 static int
 write_bytes(struct diffarg *da)
 {
 	struct uio auio;
 	struct iovec aiov;
 
 	aiov.iov_base = (caddr_t)&da->da_ddr;
 	aiov.iov_len = sizeof (da->da_ddr);
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_offset = (off_t)-1;
 	auio.uio_td = da->da_td;
 #ifdef _KERNEL
 	if (da->da_fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 	return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
 #else
 	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Flush the record being accumulated, if any.  An empty record (DDR_NONE)
  * just clears da_err.  Otherwise the record is written, da_err is set to
  * the write result, and the output offset is advanced by one record size
  * whether or not the write succeeded.  Returns da_err.
  */
 static int
 write_record(struct diffarg *da)
 {
 	if (da->da_ddr.ddr_type != DDR_NONE) {
 		da->da_err = write_bytes(da);
 		*da->da_offp += sizeof (da->da_ddr);
 	} else {
 		da->da_err = 0;
 	}
 
 	return (da->da_err);
 }
 
 /*
  * Report that objects [first, last] are free.  If the record being built is
  * already a DDR_FREE range ending right before `first`, it is extended;
  * otherwise the current record is flushed and a new DDR_FREE record is
  * started.  Returns 0, or the error from flushing the previous record.
  */
 static int
 report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
 {
 	ASSERT(first <= last);
 
 	if (da->da_ddr.ddr_type == DDR_FREE &&
 	    first == da->da_ddr.ddr_last + 1) {
 		/* Contiguous with the pending free range: just grow it. */
 		da->da_ddr.ddr_last = last;
 		return (0);
 	}
 
 	if (write_record(da) != 0)
 		return (da->da_err);
 
 	da->da_ddr.ddr_type = DDR_FREE;
 	da->da_ddr.ddr_first = first;
 	da->da_ddr.ddr_last = last;
 	return (0);
 }
 
 /*
  * Report a single object.  Unallocated dnodes (DMU_OT_NONE) are reported as
  * a one-object free range.  Allocated ones extend the pending DDR_INUSE
  * record when `object` directly follows it; otherwise the pending record is
  * flushed and a new DDR_INUSE record is started.  Returns 0, or the error
  * from flushing the previous record.
  */
 static int
 report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
 {
 	ASSERT(dnp != NULL);
 	if (dnp->dn_type == DMU_OT_NONE)
 		return (report_free_dnode_range(da, object, object));
 
 	if (da->da_ddr.ddr_type == DDR_INUSE &&
 	    object == da->da_ddr.ddr_last + 1) {
 		/* Contiguous with the pending in-use range: just grow it. */
 		da->da_ddr.ddr_last = object;
 		return (0);
 	}
 
 	if (write_record(da) != 0)
 		return (da->da_err);
 
 	da->da_ddr.ddr_type = DDR_INUSE;
 	da->da_ddr.ddr_first = object;
 	da->da_ddr.ddr_last = object;
 	return (0);
 }
 
 /*
  * Bytes of the object covered by one block pointer at indirection `level`
  * of the dnode `dnp`: the data block size (dn_datablkszsec sectors of
  * 1 << SPA_MINBLOCKSHIFT bytes) multiplied by the number of block pointers
  * per indirect block, once per level.  Both arguments are parenthesized so
  * that expression arguments (e.g. `p + 1`) expand correctly.
  */
 #define	DBP_SPAN(dnp, level)				  \
 	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
 	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
 
 /*
  * Traversal callback used by dmu_diff(); `arg` is the struct diffarg.
  *
  * Only blocks of the meta-dnode object are examined; everything else
  * returns 0 immediately.  A hole is reported as a range of free objects.  A
  * level-0 block is read through the ARC and each dnode in it is reported
  * individually, after which TRAVERSE_VISIT_NO_CHILDREN is returned so the
  * objects' own data blocks are not visited.
  *
  * Returns EINTR if a signal is pending, EIO if the block cannot be read,
  * or the error from reporting a record.
  */
 /* ARGSUSED */
 static int
 diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct diffarg *da = arg;
 	int err = 0;
 
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (SET_ERROR(EINTR));
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT)
 		return (0);
 
 	if (BP_IS_HOLE(bp)) {
 		/* Convert the hole's byte span into a range of object numbers. */
 		uint64_t span = DBP_SPAN(dnp, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 
 		err = report_free_dnode_range(da, dnobj,
 		    dnobj + (span >> DNODE_SHIFT) - 1);
 		if (err)
 			return (err);
 	} else if (zb->zb_level == 0) {
 		dnode_phys_t *blk;
 		arc_buf_t *abuf;
 		uint32_t aflags = ARC_WAIT;
 		int blksz = BP_GET_LSIZE(bp);
 		int i;
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
 		blk = abuf->b_data;
 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
 			/* Object number = first object of this block + i. */
 			uint64_t dnobj = (zb->zb_blkid <<
 			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
 			err = report_dnode(da, dnobj, blk+i);
 			if (err)
 				break;
 		}
 		/* The buffer is released before any error is returned. */
 		(void) arc_buf_remove_ref(abuf, &abuf);
 		if (err)
 			return (err);
 		/* Don't care about the data blocks */
 		return (TRAVERSE_VISIT_NO_CHILDREN);
 	}
 	return (0);
 }
 
 /*
  * Write a stream of dmu_diff_record_t records describing which objects are
  * in use or free in `tosnap_name`, considering only blocks born after
  * `fromsnap_name` was created (its creation txg is passed to
  * traverse_dataset()).
  *
  * Both names must contain '@' (EINVAL otherwise), and EXDEV is returned
  * when dsl_dataset_is_before(tosnap, fromsnap, 0) is false.  `*offp` is
  * advanced as records are written.  Returns 0 or the first error hit while
  * holding the datasets, traversing, or writing.
  *
  * NOTE(review): the body uses `fp`, which only exists in the non-illumos
  * signature; the `#ifdef illumos` variant names its parameter `vp`.
  */
 int
 dmu_diff(const char *tosnap_name, const char *fromsnap_name,
 #ifdef illumos
     struct vnode *vp, offset_t *offp)
 #else
     struct file *fp, offset_t *offp)
 #endif
 {
 	struct diffarg da;
 	dsl_dataset_t *fromsnap;
 	dsl_dataset_t *tosnap;
 	dsl_pool_t *dp;
 	int error;
 	uint64_t fromtxg;
 
 	if (strchr(tosnap_name, '@') == NULL ||
 	    strchr(fromsnap_name, '@') == NULL)
 		return (SET_ERROR(EINVAL));
 
 	error = dsl_pool_hold(tosnap_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
 	if (error != 0) {
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
 		dsl_dataset_rele(fromsnap, FTAG);
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Only the creation txg of fromsnap is needed; drop it right away. */
-	fromtxg = fromsnap->ds_phys->ds_creation_txg;
+	fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
 	dsl_dataset_rele(fromsnap, FTAG);
 
 	/* A long hold on tosnap is taken before the pool hold is dropped. */
 	dsl_dataset_long_hold(tosnap, FTAG);
 	dsl_pool_rele(dp, FTAG);
 
 	da.da_fp = fp;
 	da.da_offp = offp;
 	da.da_ddr.ddr_type = DDR_NONE;
 	da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
 	da.da_err = 0;
 	da.da_td = curthread;
 
 	error = traverse_dataset(tosnap, fromtxg,
 	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
 
 	if (error != 0) {
 		da.da_err = error;
 	} else {
 		/* we set the da.da_err we return as side-effect */
 		(void) write_record(&da);
 	}
 
 	dsl_dataset_long_rele(tosnap, FTAG);
 	dsl_dataset_rele(tosnap, FTAG);
 
 	return (da.da_err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 275782)
@@ -1,1825 +1,1829 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dnode.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 
 /*
  * Global reader/writer lock needed to close a window in dnode_move() that
  * allows the objset to be freed before it can be safely accessed.
  *
  * It is initialized by dmu_objset_init() and destroyed by
  * dmu_objset_fini().  dmu_objset_evict() takes and immediately drops it
  * as a reader, as a barrier, before freeing the objset.
  */
 krwlock_t os_lock;
 
 /*
  * Initialize the global os_lock reader/writer lock.
  */
 void
 dmu_objset_init(void)
 {
 	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
 }
 
 /*
  * Destroy the global os_lock set up by dmu_objset_init().
  */
 void
 dmu_objset_fini(void)
 {
 	rw_destroy(&os_lock);
 }
 
 /*
  * Return the spa_t recorded in the objset's os_spa field.
  */
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
 	spa_t *spa = os->os_spa;
 
 	return (spa);
 }
 
 /*
  * Return the zilog_t recorded in the objset's os_zil field.
  */
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
 	zilog_t *zilog = os->os_zil;
 
 	return (zilog);
 }
 
 /*
  * Return the dsl_pool_t for this objset: the pool of the dataset's
  * dsl_dir when the objset has a dataset with a ds_dir, otherwise the
  * pool obtained from the spa.
  */
 dsl_pool_t *
 dmu_objset_pool(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	/* No dataset, or no dsl_dir hanging off it: fall back to the spa. */
 	if (ds == NULL || ds->ds_dir == NULL)
 		return (spa_get_dsl(os->os_spa));
 
 	return (ds->ds_dir->dd_pool);
 }
 
 /*
  * Return the objset's os_dsl_dataset pointer (NULL is passed through
  * unchanged).
  */
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	return (ds);
 }
 
 /*
  * Return the objset type stored in the objset's physical header
  * (os_phys->os_type).
  */
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
 	objset_phys_t *phys = os->os_phys;
 
 	return (phys->os_type);
 }
 
 /*
  * Write the name of the objset's dataset into buf, as produced by
  * dsl_dataset_name().
  */
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	dsl_dataset_name(ds, buf);
 }
 
 /*
  * Return the object number of the objset's dataset, or 0 when the
  * objset has no dataset.
  */
 uint64_t
 dmu_objset_id(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL)
 		return (0);
 
 	return (ds->ds_object);
 }
 
 /*
  * Return the objset's cached "sync" setting (os_sync).
  */
 zfs_sync_type_t
 dmu_objset_syncprop(objset_t *os)
 {
 	zfs_sync_type_t sync = os->os_sync;
 
 	return (sync);
 }
 
 /*
  * Return the objset's cached "logbias" setting (os_logbias).
  */
 zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	zfs_logbias_op_t logbias = os->os_logbias;
 
 	return (logbias);
 }
 
 /*
  * Property callback for "checksum": cache the checksum selected from
  * newval (with ZIO_CHECKSUM_ON_VALUE as the second selector argument)
  * in os_checksum.  arg is the objset_t.
  */
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 	enum zio_checksum selected;
 
 	/* Inheritance should already have been resolved. */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	selected = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 	os->os_checksum = selected;
 }
 
 /*
  * Property callback for "compression": cache the algorithm selected
  * from newval (with ZIO_COMPRESS_ON_VALUE as the second selector
  * argument) in os_compress.  arg is the objset_t.
  */
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 	enum zio_compress selected;
 
 	/* Inheritance and range checking should already have been done. */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
 	selected = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
 	os->os_compress = selected;
 }
 
 /*
  * Property callback for "copies": cache newval in os_copies.
  * arg is the objset_t.
  */
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval > 0);
 	ASSERT(newval <= spa_max_replication(os->os_spa));
 
 	os->os_copies = newval;
 }
 
 /*
  * Property callback for "dedup": split the value selected by
  * zio_checksum_dedup_select() into the checksum function
  * (os_dedup_checksum) and the verify flag (os_dedup_verify).
  * arg is the objset_t.
  */
 static void
 dedup_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 	enum zio_checksum checksum;
 
 	/* Inheritance should already have been resolved. */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	checksum = zio_checksum_dedup_select(os->os_spa, newval,
 	    ZIO_CHECKSUM_OFF);
 
 	os->os_dedup_verify = ((checksum & ZIO_CHECKSUM_VERIFY) != 0);
 	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 }
 
 /*
  * Property callback for "primarycache": cache newval in
  * os_primary_cache.  arg is the objset_t.
  */
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os->os_primary_cache = newval;
 }
 
 /*
  * Property callback for "secondarycache": cache newval in
  * os_secondary_cache.  arg is the objset_t.
  */
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os->os_secondary_cache = newval;
 }
 
 /*
  * Property callback for "sync": cache newval in os_sync and, when the
  * objset has a ZIL, forward it with zil_set_sync().  arg is the
  * objset_t.
  */
 static void
 sync_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/* Inheritance and range checking should already have been done. */
 	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 	    newval == ZFS_SYNC_DISABLED);
 
 	os->os_sync = newval;
 
 	if (os->os_zil == NULL)
 		return;
 	zil_set_sync(os->os_zil, newval);
 }
 
 /*
  * Property callback for "redundant_metadata": cache newval in
  * os_redundant_metadata.  arg is the objset_t.
  */
 static void
 redundant_metadata_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 	    newval == ZFS_REDUNDANT_METADATA_MOST);
 
 	os->os_redundant_metadata = newval;
 }
 
 /*
  * Property callback for "logbias": cache newval in os_logbias and, when
  * the objset has a ZIL, forward it with zil_set_logbias().  arg is the
  * objset_t.
  */
 static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 	    newval == ZFS_LOGBIAS_THROUGHPUT);
 
 	os->os_logbias = newval;
 
 	if (os->os_zil == NULL)
 		return;
 	zil_set_logbias(os->os_zil, newval);
 }
 
 /*
  * Property callback for "recordsize": cache newval in os_recordsize.
  * arg is the objset_t.
  */
 static void
 recordsize_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	os->os_recordsize = newval;
 }
 
 /*
  * Byteswap an objset_phys_t in place.  buf may hold either the old
  * (OBJSET_OLD_PHYS_SIZE) or the current (sizeof (objset_phys_t))
  * layout; the user/group accounting dnodes are only swapped for the
  * latter.
  */
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *osp = buf;
 
 	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 
 	/* Swapped for either accepted size. */
 	dnode_byteswap(&osp->os_meta_dnode);
 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 	osp->os_type = BSWAP_64(osp->os_type);
 	osp->os_flags = BSWAP_64(osp->os_flags);
 
 	if (size != sizeof (objset_phys_t))
 		return;
 
 	dnode_byteswap(&osp->os_userused_dnode);
 	dnode_byteswap(&osp->os_groupused_dnode);
 }
 
 /*
  * Allocate and initialize the in-core objset_t for the objset whose
  * root block pointer is bp.
  *
  * spa - pool the objset lives in
  * ds  - owning dataset, or NULL for the meta-objset; when non-NULL the
  *       caller must hold ds->ds_opening_lock
  * bp  - root block pointer; remembered in os_rootbp
  * osp - on success, receives the new objset
  *
  * Returns 0 on success.  On failure everything allocated here is
  * released and the error from arc_read() (with ECKSUM converted to EIO)
  * or from dsl_prop_register() is returned.
  */
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     objset_t **osp)
 {
 	objset_t *os;
 	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		/* The objset exists on disk: read its objset_phys_t. */
 		uint32_t aflags = ARC_WAIT;
 		zbookmark_phys_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_L2CACHE;
 		if (DMU_OS_IS_L2COMPRESSIBLE(os))
 			aflags |= ARC_L2COMPRESS;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
 			/*
 			 * Copy the smaller on-disk header into a zeroed
 			 * full-size buffer and switch over to it.
 			 */
 			arc_buf_t *buf = arc_buf_alloc(spa,
 			    sizeof (objset_phys_t), &os->os_phys_buf,
 			    ARC_BUFC_METADATA);
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
 			(void) arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
 		os->os_phys = os->os_phys_buf->b_data;
 		os->os_flags = os->os_phys->os_flags;
 	} else {
 		/* Hole: start from a zero-filled header of the right size. */
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 		os->os_phys_buf = arc_buf_alloc(spa, size,
 		    &os->os_phys_buf, ARC_BUFC_METADATA);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
 
 	/*
 	 * Note: the changed_cb will be called once before the register
 	 * func returns, thus changing the checksum/compression from the
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
 	if (ds != NULL) {
 		/* Each registration is skipped once an earlier one failed. */
 		err = dsl_prop_register(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
 		if (err == 0) {
 			err = dsl_prop_register(ds,
 			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
 		}
 		if (!dsl_dataset_is_snapshot(ds)) {
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    recordsize_changed_cb, os);
 			}
 		}
 		if (err != 0) {
 			/* Undo the buffer and objset allocations above. */
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf));
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
 	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_LZJB;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 		os->os_dedup_verify = B_FALSE;
 		os->os_logbias = ZFS_LOGBIAS_LATENCY;
 		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
 
 	/* Snapshots keep the zeroed os_zil_header from kmem_zalloc(). */
 	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 	}
 	list_create(&os->os_dnodes, sizeof (dnode_t),
 	    offsetof(dnode_t, dn_link));
 	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	DMU_META_DNODE(os) = dnode_special_open(os,
 	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
 	    &os->os_meta_dnode);
 	/* The accounting dnodes only exist in a full-size header. */
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
 		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
 		    &os->os_userused_dnode);
 		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
 		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
 		    &os->os_groupused_dnode);
 	}
 
 	*osp = os;
 	return (0);
 }
 
 /*
  * Return, through osp, the objset attached to ds, opening and caching
  * it in ds->ds_objset on first use.  The lookup is serialized by
  * ds->ds_opening_lock.  Returns 0 or the error from
  * dmu_objset_open_impl(); on error *osp is set to the (still NULL)
  * ds->ds_objset.
  */
 int
 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
 	int err = 0;
 
 	mutex_enter(&ds->ds_opening_lock);
 
 	if (ds->ds_objset == NULL) {
 		spa_t *spa = dsl_dataset_get_spa(ds);
 		blkptr_t *rootbp = dsl_dataset_get_blkptr(ds);
 		objset_t *os;
 
 		err = dmu_objset_open_impl(spa, ds, rootbp, &os);
 		if (err == 0) {
 			/* Publish the freshly opened objset under ds_lock. */
 			mutex_enter(&ds->ds_lock);
 			ASSERT(ds->ds_objset == NULL);
 			ds->ds_objset = os;
 			mutex_exit(&ds->ds_lock);
 		}
 	}
 
 	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 
 	return (err);
 }
 
 /*
  * Hold the named objset on behalf of tag.
  *
  * The pool stays held for as long as the objset is held, so only one
  * objset can be held at a time.  On failure neither the pool nor the
  * dataset remains held.
  */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	if ((err = dsl_pool_hold(name, tag, &dp)) != 0)
 		return (err);
 
 	if ((err = dsl_dataset_hold(dp, name, tag, &ds)) != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	if ((err = dmu_objset_from_ds(ds, osp)) == 0)
 		return (0);
 
 	/* Could not open the objset: drop both holds again. */
 	dsl_dataset_rele(ds, tag);
 	dsl_pool_rele(dp, tag);
 	return (err);
 }
 
 /*
  * Take ownership of the named objset on behalf of tag.
  *
  * The dsl_pool must not be held by the caller.  On success the dataset
  * is long-held and the dsl_pool is no longer held.
  *
  * Fails with EINVAL when type is not DMU_OST_ANY and differs from the
  * objset's on-disk type, and with EROFS when a snapshot is requested
  * without readonly; in every failure case ownership is dropped again.
  */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	if ((err = dsl_pool_hold(name, FTAG, &dp)) != 0)
 		return (err);
 
 	if ((err = dsl_dataset_own(dp, name, tag, &ds)) != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	err = dmu_objset_from_ds(ds, osp);
 	dsl_pool_rele(dp, FTAG);
 
 	if (err != 0) {
 		dsl_dataset_disown(ds, tag);
 		return (err);
 	}
 
 	if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (!readonly && dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EROFS));
 	}
 
 	return (0);
 }
 
 /*
  * Release tag's hold on the objset's dataset and then on its pool.
  * The pool pointer is looked up before the dataset is released.
  */
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	dsl_dataset_rele(ds, tag);
 	dsl_pool_rele(dp, tag);
 }
 
 /*
  * When we are called, os MUST refer to an objset associated with a dataset
  * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
  * == tag.  We will then release and reacquire ownership of the dataset while
  * holding the pool config_rwlock, so that no intervening namespace or
  * ownership changes can occur.
  *
  * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
  * release the hold on its dataset and acquire a new one on the dataset of the
  * same name so that it can be partially torn down and reconstructed.
  */
 void
 dmu_objset_refresh_ownership(objset_t *os, void *tag)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds, *newds;
 	char name[MAXNAMELEN];
 
 	/* Enforce the preconditions described above. */
 	ds = os->os_dsl_dataset;
 	VERIFY3P(ds, !=, NULL);
 	VERIFY3P(ds->ds_owner, ==, tag);
 	VERIFY(dsl_dataset_long_held(ds));
 
 	/* Capture the name and pool before ownership is dropped. */
 	dsl_dataset_name(ds, name);
 	dp = dmu_objset_pool(os);
 	dsl_pool_config_enter(dp, FTAG);
 	dmu_objset_disown(os, tag);
 	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
 	/* Re-owning by name must yield the very same dataset. */
 	VERIFY3P(newds, ==, os->os_dsl_dataset);
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Give up tag's ownership of the objset's dataset.
  */
 void
 dmu_objset_disown(objset_t *os, void *tag)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	dsl_dataset_disown(ds, tag);
 }
 
 /*
  * Call dnode_evict_dbufs() on every dnode in os->os_dnodes that a
  * reference can be taken on, handling the meta-dnode last.
  *
  * os_lock protects the list walk but is dropped around each
  * dnode_evict_dbufs() call; a reference on the next candidate is taken
  * before the lock is dropped.
  */
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
 
 	/* process the mdn last, since the other dnodes have holds on it */
 	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
 	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
 
 	/*
 	 * Find the first dnode with holds.  We have to do this dance
 	 * because dnode_add_ref() only works if you already have a
 	 * hold.  If there are no holds then it has no dbufs so OK to
 	 * skip.
 	 */
 	for (dn = list_head(&os->os_dnodes);
 	    dn && !dnode_add_ref(dn, FTAG);
 	    dn = list_next(&os->os_dnodes, dn))
 		continue;
 
 	while (dn) {
 		dnode_t *next_dn = dn;
 
 		/* Reference the successor while os_lock is still held. */
 		do {
 			next_dn = list_next(&os->os_dnodes, next_dn);
 		} while (next_dn && !dnode_add_ref(next_dn, FTAG));
 
 		mutex_exit(&os->os_lock);
 		dnode_evict_dbufs(dn);
 		dnode_rele(dn, FTAG);
 		mutex_enter(&os->os_lock);
 		dn = next_dn;
 	}
 	mutex_exit(&os->os_lock);
 }
 
 /*
  * Tear down and free an in-core objset: unregister the property
  * callbacks registered by dmu_objset_open_impl(), tear down SA state,
  * evict dbufs, close the special dnodes, free the ZIL, release the
  * objset_phys_t buffer and finally free the objset_t itself.
  *
  * The objset must not be dirty in any txg.
  */
 void
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
 	if (ds) {
 		/* These callbacks are only registered for non-snapshots. */
 		if (!dsl_dataset_is_snapshot(ds)) {
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 			    checksum_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 			    compression_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_COPIES),
 			    copies_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP),
 			    dedup_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 			    logbias_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_SYNC),
 			    sync_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
 			    redundant_metadata_changed_cb, os));
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 			    recordsize_changed_cb, os));
 		}
 		VERIFY0(dsl_prop_unregister(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os));
 		VERIFY0(dsl_prop_unregister(ds,
 		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 		    secondary_cache_changed_cb, os));
 	}
 
 	if (os->os_sa)
 		sa_tear_down(os);
 
 	dmu_objset_evict_dbufs(os);
 
 	dnode_special_close(&os->os_meta_dnode);
 	/* The two accounting dnodes are opened (and closed) as a pair. */
 	if (DMU_USERUSED_DNODE(os)) {
 		dnode_special_close(&os->os_userused_dnode);
 		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
 	 * use. We consider the objset valid before the barrier and invalid
 	 * after the barrier.
 	 */
 	rw_enter(&os_lock, RW_READER);
 	rw_exit(&os_lock);
 
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
 	kmem_free(os, sizeof (objset_t));
 }
 
 /*
  * Return dsl_dir_snap_cmtime() for the dsl_dir of the objset's
  * dataset.  The objset is dereferenced unconditionally, so it must
  * have a dataset.
  */
 timestruc_t
 dmu_objset_snap_cmtime(objset_t *os)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 
 	return (dsl_dir_snap_cmtime(dd));
 }
 
 /*
  * called from dsl for meta-objset
  *
  * Initialize a new objset of the given type in syncing context: open
  * the in-core objset (through ds when ds is non-NULL, otherwise
  * directly from bp), allocate its meta-dnode, record the type in the
  * objset_phys_t and dirty the dataset.  Returns the objset.
  */
 objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	objset_t *os;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (ds != NULL)
 		VERIFY0(dmu_objset_from_ds(ds, &os));
 	else
 		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 
 	/*
 	 * We don't want to have to increase the meta-dnode's nlevels
 	 * later, because then we could do it in quiescing context while
 	 * we are also accessing it in open context.
 	 *
 	 * This precaution is not necessary for the MOS (ds == NULL),
 	 * because the MOS is only updated in syncing context.
 	 * This is most fortunate: the MOS is the only objset that
 	 * needs to be synced multiple times as spa_sync() iterates
 	 * to convergence, so minimizing its dn_nlevels matters.
 	 */
 	if (ds != NULL) {
 		int levels = 1;
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
 		 * to contain DN_MAX_OBJECT dnodes.
 		 */
 		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 		    mdn->dn_nlevels = levels;
 	}
 
 	ASSERT(type != DMU_OST_NONE);
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	os->os_phys->os_type = type;
 	/* A new objset starts out with user accounting marked complete. */
 	if (dmu_objset_userused_enabled(os)) {
 		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 		os->os_flags = os->os_phys->os_flags;
 	}
 
 	/* NOTE(review): ds may be NULL here (MOS) -- confirm callee copes. */
 	dsl_dataset_dirty(ds, tx);
 
 	return (os);
 }
 
 /*
  * Arguments passed from dmu_objset_create() to its sync-task callbacks,
  * dmu_objset_create_check() and dmu_objset_create_sync().
  */
 typedef struct dmu_objset_create_arg {
 	const char *doca_name;		/* name of the dataset to create */
 	cred_t *doca_cred;		/* CRED() of the creating caller */
 	/* optional hook run on the new objset from the sync callback */
 	void (*doca_userfunc)(objset_t *os, void *arg,
 	    cred_t *cr, dmu_tx_t *tx);
 	void *doca_userarg;		/* opaque argument for doca_userfunc */
 	dmu_objset_type_t doca_type;	/* type of the new objset */
 	uint64_t doca_flags;		/* for dsl_dataset_create_sync() */
 } dmu_objset_create_arg_t;
 
 /*
  * Sync-task check callback for dmu_objset_create().
  *
  * Rejects names containing '@' (EINVAL) and names for which
  * dsl_dir_hold() leaves no tail component (EEXIST); otherwise returns
  * the result of the filesystem-limit check on the parent directory.
  */
 /*ARGSUSED*/
 static int
 dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	int error;
 
 	if (strchr(doca->doca_name, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = dsl_dir_hold(dp, doca->doca_name, FTAG,
 	    &pdd, &tail)) != 0)
 		return (error);
 
 	if (tail != NULL) {
 		error = dsl_fs_ss_limit_check(pdd, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred);
 		dsl_dir_rele(pdd, FTAG);
 		return (error);
 	}
 
 	dsl_dir_rele(pdd, FTAG);
 	return (SET_ERROR(EEXIST));
 }
 
 /*
  * Sync-task sync callback for dmu_objset_create(): create the dataset
  * under its parent directory, initialize its objset, run the caller's
  * optional hook on it and log the creation in the pool history.
  */
 static void
 dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *ds;
 	objset_t *os;
 	uint64_t obj;
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
 	/* No origin is given (NULL): this is a plain create, not a clone. */
 	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), doca->doca_type, tx);
 
 	if (doca->doca_userfunc != NULL)
 		(*doca->doca_userfunc)(os, doca->doca_userarg,
 		    doca->doca_cred, tx);
 
 	spa_history_log_internal_ds(ds, "create", tx, "");
 
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 /*
  * Create a new objset called name of the given type by running a sync
  * task.  flags is handed to dsl_dataset_create_sync(); func, when not
  * NULL, is called on the new objset with arg from the sync callback.
  * Returns the result of dsl_sync_task().
  */
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
 	dmu_objset_create_arg_t doca = {
 		.doca_name = name,
 		.doca_cred = CRED(),
 		.doca_userfunc = func,
 		.doca_userarg = arg,
 		.doca_type = type,
 		.doca_flags = flags
 	};
 
 	return (dsl_sync_task(name,
 	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * Arguments passed from dmu_objset_clone() to its sync-task callbacks,
  * dmu_objset_clone_check() and dmu_objset_clone_sync().
  */
 typedef struct dmu_objset_clone_arg {
 	const char *doca_clone;		/* name of the clone to create */
 	const char *doca_origin;	/* name of the origin (a snapshot) */
 	cred_t *doca_cred;		/* CRED() of the cloning caller */
 } dmu_objset_clone_arg_t;
 
 /*
  * Sync-task check callback for dmu_objset_clone().
  *
  * Errors: EINVAL if the clone name contains '@' or the origin is not a
  * snapshot; EEXIST if dsl_dir_hold() leaves no tail component for the
  * clone name; EXDEV if either the parent directory or the origin lives
  * in another pool; EDQUOT if the filesystem-limit check fails; or the
  * error from dsl_dir_hold() / dsl_dataset_hold().
  */
 /*ARGSUSED*/
 static int
 dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *origin;
 	dsl_dir_t *pdd;
 	const char *tail;
 	boolean_t same_pool, is_snap;
 	int error;
 
 	if (strchr(doca->doca_clone, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = dsl_dir_hold(dp, doca->doca_clone, FTAG,
 	    &pdd, &tail)) != 0)
 		return (error);
 
 	if (tail == NULL) {
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/* You can't clone across pools. */
 	if (pdd->dd_pool != dp) {
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Any limit-check failure is reported as EDQUOT. */
 	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 	    doca->doca_cred);
 	dsl_dir_rele(pdd, FTAG);
 	if (error != 0)
 		return (SET_ERROR(EDQUOT));
 
 	if ((error = dsl_dataset_hold(dp, doca->doca_origin, FTAG,
 	    &origin)) != 0)
 		return (error);
 
 	/*
 	 * Inspect the origin while it is held; the snapshot test is only
 	 * made when the pool test passed.
 	 */
 	same_pool = (origin->ds_dir->dd_pool == dp);
 	is_snap = (same_pool && dsl_dataset_is_snapshot(origin));
 	dsl_dataset_rele(origin, FTAG);
 
 	/* You can't clone across pools. */
 	if (!same_pool)
 		return (SET_ERROR(EXDEV));
 
 	/* You can only clone snapshots, not the head datasets. */
 	if (!is_snap)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Sync-task sync callback for dmu_objset_clone(): create the clone
  * dataset from the held origin and record the operation, including the
  * origin's name and object number, in the pool history.
  */
 static void
 dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	char origin_name[MAXNAMELEN];
 	dsl_dataset_t *origin;
 	dsl_dataset_t *ds;
 	dsl_dir_t *pdd;
 	const char *tail;
 	uint64_t obj;
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
 	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 
 	dsl_dataset_name(origin, origin_name);
 	spa_history_log_internal_ds(ds, "clone", tx,
 	    "origin=%s (%llu)", origin_name, origin->ds_object);
 
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dataset_rele(origin, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_clone(const char *clone, const char *origin)
 {
 	dmu_objset_clone_arg_t doca;
 
 	doca.doca_clone = clone;
 	doca.doca_origin = origin;
 	doca.doca_cred = CRED();
 
 	return (dsl_sync_task(clone,
 	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * Take the single snapshot "fsname@snapname" by handing a one-entry
  * nvlist to dsl_dataset_snapshot().  Returns its result.
  */
 int
 dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 {
 	char *longsnap;
 	nvlist_t *snaps;
 	int err;
 
 	longsnap = kmem_asprintf("%s@%s", fsname, snapname);
 	snaps = fnvlist_alloc();
 
 	/* The full name is no longer needed once it has been added. */
 	fnvlist_add_boolean(snaps, longsnap);
 	strfree(longsnap);
 
 	err = dsl_dataset_snapshot(snaps, NULL, NULL);
 
 	fnvlist_free(snaps);
 	return (err);
 }
 
 /*
  * Drain list, syncing each dnode on it with dnode_sync().  When newlist
  * is not NULL, each dnode is first given a reference tagged with
  * newlist and appended to it.
  */
 static void
 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	for (dn = list_head(list); dn != NULL; dn = list_head(list)) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 
 		/*
 		 * dn_zio is set up here rather than inside dnode_sync()
 		 * because the meta-dnode needs to set it outside
 		 * dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		list_remove(list, dn);
 
 		if (newlist != NULL) {
 			(void) dnode_add_ref(dn, newlist);
 			list_insert_tail(newlist, dn);
 		}
 
 		dnode_sync(dn, tx);
 	}
 }
 
 /*
  * "Ready" callback of the objset root block write: recompute the root
  * block pointer's fill count from the meta-dnode's block pointers.
  * arg is the objset_t.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	blkptr_t *bp = zio->io_bp;
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 	uint64_t fill = 0;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT3P(bp, ==, os->os_rootbp);
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
 	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * The rootbp fill count is the number of objects allocated in the
 	 * object set.  The "special" objects stored in the objset_phys_t
 	 * itself (the meta dnode and the user/group accounting objects)
 	 * are not counted.
 	 */
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
 
 	bp->blk_fill = fill;
 }
 
 /*
  * "Done" callback of the objset root block write.  Unless the block was
  * rewritten in place, account for the change by killing the original
  * block pointer and recording the new one as born.  arg is the
  * objset_t.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	objset_t *os = arg;
 	dsl_dataset_t *ds;
 	dmu_tx_t *tx;
 
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 		/* Rewrite in place: the block pointer must be unchanged. */
 		ASSERT(BP_EQUAL(bp, bp_orig));
 		return;
 	}
 
 	ds = os->os_dsl_dataset;
 	tx = os->os_synctx;
 
 	(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 	dsl_dataset_block_born(ds, bp, tx);
 }
 
 /*
  * called from dsl
  *
  * Sync the objset in syncing context for tx's txg.  The root block
  * write is created as a child of pio; the special dnodes and then the
  * dnodes on this txg's free and dirty lists are synced with it as their
  * parent IO; the meta-dnode's pending dirty-record zios are issued; the
  * ZIL is synced; and finally the root block zio itself is issued.
  */
 void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
 	if (os->os_dsl_dataset == NULL) {
 		/*
 		 * This is the MOS.  If we have upgraded,
 		 * spa_max_replication() could change, so reset
 		 * os_copies here.
 		 */
 		os->os_copies = spa_max_replication(os->os_spa);
 	}
 
 	/*
 	 * Create the root block IO
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
 	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
 	    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	DMU_META_DNODE(os)->dn_zio = zio;
 	dnode_sync(DMU_META_DNODE(os), tx);
 
 	/* Push the in-core flags into the header that is being written. */
 	os->os_phys->os_flags = os->os_flags;
 
 	if (DMU_USERUSED_DNODE(os) &&
 	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_USERUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_USERUSED_DNODE(os), tx);
 		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
 	if (dmu_objset_userused_enabled(os)) {
 		newlist = &os->os_synced_dnodes;
 		/*
 		 * We must create the list here because it uses the
 		 * dn_dirty_link[] of this txg.
 		 */
 		list_create(newlist, sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
 	/* With a non-NULL newlist the synced dnodes are collected on it. */
 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
 	/* Issue whatever zios are pending on the meta-dnode's records. */
 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
 		ASSERT0(dr->dr_dbuf->db_level);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
 	}
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
 	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(zio);
 }
 
 /*
  * Return whether the objset has any dnode on its dirty or free list
  * for the given txg (the lists are indexed by txg & TXG_MASK).
  */
 boolean_t
 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 {
 	uint64_t txgoff = txg & TXG_MASK;
 
 	return (!(list_is_empty(&os->os_dirty_dnodes[txgoff]) &&
 	    list_is_empty(&os->os_free_dnodes[txgoff])));
 }
 
 /*
  * Per-objset-type callbacks, indexed by dmu_objset_type_t.  An entry is
  * NULL until dmu_objset_register_type() fills it in;
  * dmu_objset_userused_enabled() requires a non-NULL entry for the
  * objset's type.
  */
 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 /*
  * Record cb as the callback for objset type ost.  ost is used as an
  * array index without a range check.
  */
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
 {
 	used_cbs[ost] = cb;
 }
 
 /*
  * Return whether user/group space accounting applies to this objset:
  * the pool version must be at least SPA_VERSION_USERSPACE, a callback
  * must be registered for the objset's type, and the objset must have a
  * userused dnode.
  */
 boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	if (spa_version(os->os_spa) < SPA_VERSION_USERSPACE)
 		return (B_FALSE);
 
 	if (used_cbs[os->os_phys->os_type] == NULL)
 		return (B_FALSE);
 
 	return (DMU_USERUSED_DNODE(os) != NULL);
 }
 
 /*
  * Adjust the per-user and per-group space totals by DNODE_SIZE + used,
  * subtracting instead of adding when subtract is set.  Nothing is done
  * unless flags contains DNODE_FLAG_USERUSED_ACCOUNTED.
  */
 static void
 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
     uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
 {
 	int64_t delta;
 
 	if (!(flags & DNODE_FLAG_USERUSED_ACCOUNTED))
 		return;
 
 	delta = DNODE_SIZE + used;
 	if (subtract)
 		delta = -delta;
 
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
 	    user, delta, tx));
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
 	    group, delta, tx));
 }
 
 /*
  * Drain os->os_synced_dnodes, applying each dnode's user/group space
  * accounting change: the old owner's usage is subtracted (when
  * DN_ID_OLD_EXIST is set) and the new owner's usage is added (when
  * DN_ID_NEW_EXIST is set).  Each dnode's "new" ids then become its
  * "old" ids, and the reference tagged with the list is dropped.
  */
 void
 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
 		/*
 		 * We intentionally modify the zap object even if the
 		 * net delta is zero.  Otherwise
 		 * the block of the zap obj could be shared between
 		 * datasets but need to be different between them after
 		 * a bprewrite.
 		 */
 
 		/* Snapshot the flags; this read is made without dn_mtx. */
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
 			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
 			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
 			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
 			    dn->dn_newgid, B_FALSE, tx);
 		}
 
 		/* Roll the "new" ids over to "old" under dn_mtx. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_oldused = 0;
 		dn->dn_oldflags = 0;
 		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
 			dn->dn_olduid = dn->dn_newuid;
 			dn->dn_oldgid = dn->dn_newgid;
 			dn->dn_id_flags |= DN_ID_OLD_EXIST;
 			if (dn->dn_bonuslen == 0)
 				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 			else
 				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		}
 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		mutex_exit(&dn->dn_mtx);
 
 		list_remove(list, dn);
 		/* Drops the reference tagged with this list. */
 		dnode_rele(dn, list);
 	}
 }
 
 /*
  * Returns a pointer to the data from which the uid/gid can be found.
  *
  * If the dbuf has no dirty records, its current contents (db->db.db_data)
  * are returned.  Otherwise the dirty record belonging to the transaction
  * group that is syncing (tx->tx_txg) is used.  If a dirty record for that
  * transaction group can't be found then NULL is returned.  In the NULL
  * case it is assumed the uid/gid aren't changing.
  *
  * NOTE(review): both callers visible in this file hold db->db_mtx across
  * the call; confirm that this is a requirement.
  */
 static void *
 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr, **drp;
 	void *data;
 
 	if (db->db_dirtycnt == 0)
 		return (db->db.db_data);  /* Nothing is changing */
 
 	/* Walk the dirty-record chain looking for the syncing txg. */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg == tx->tx_txg)
 			break;
 
 	if (dr == NULL) {
 		data = NULL;
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(dr->dr_dbuf);
 		dn = DB_DNODE(dr->dr_dbuf);
 
 		/*
 		 * For the spill block of a dnode without a bonus buffer the
 		 * dirty data is reached through dr_data->b_data; in every
 		 * other case dr_data itself is returned.
 		 */
 		if (dn->dn_bonuslen == 0 &&
 		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
 			data = dr->dt.dl.dr_data->b_data;
 		else
 			data = dr->dt.dl.dr_data;
 
 		DB_DNODE_EXIT(dr->dr_dbuf);
 	}
 
 	return (data);
 }
 
 /*
  * Determine the owning uid/gid of a dnode and record them in the dnode
  * for later space accounting.
  *
  * dn     - the dnode being examined.
  * before - B_TRUE to capture the ids as they were (stored in dn_olduid and
  *          dn_oldgid); B_FALSE to capture the ids being written in tx
  *          (stored in dn_newuid and dn_newgid).
  * tx     - the transaction used to locate the matching dirty record.
  *
  * The ids are extracted by the callback registered in used_cbs[] for the
  * objset's type, which is handed either the bonus buffer or, for an SA
  * dnode without a bonus buffer, the spill block.  On success
  * DN_ID_OLD_EXIST or DN_ID_NEW_EXIST is set, along with the flag recording
  * which source was checked.  Does nothing if accounting is not enabled for
  * the objset.
  */
 void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
 	uint64_t *user = NULL;
 	uint64_t *group = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;
 
 	if (!dmu_objset_userused_enabled(dn->dn_objset))
 		return;
 
 	/* The "before" ids only need to be captured once. */
 	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
 	    DN_ID_CHKED_SPILL)))
 		return;
 
 	/*
 	 * Pick the buffer the ids will be read from.  Whenever a dbuf is
 	 * used, its db_mtx is taken here and dropped after the callback.
 	 */
 	if (before && dn->dn_bonuslen != 0)
 		data = DN_BONUS(dn->dn_phys);
 	else if (!before && dn->dn_bonuslen != 0) {
 		if (dn->dn_bonus) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 			data = dmu_objset_userquota_find_data(db, tx);
 		} else {
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
 			/* No bonus buffer: the SA data lives in the spill block. */
 			int rf = 0;
 
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 				rf |= DB_RF_HAVESTRUCT;
 			error = dmu_spill_hold_by_dnode(dn,
 			    rf | DB_RF_MUST_SUCCEED,
 			    FTAG, (dmu_buf_t **)&db);
 			ASSERT(error == 0);
 			mutex_enter(&db->db_mtx);
 			data = (before) ? db->db.db_data :
 			    dmu_objset_userquota_find_data(db, tx);
 			have_spill = B_TRUE;
 	} else {
 		/* Nothing to read ids from; just note that we looked. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 
 	/*
 	 * Choose where the callback stores its result.  In the "after"
 	 * case with no data, user and group stay NULL.
 	 */
 	if (before) {
 		ASSERT(data);
 		user = &dn->dn_olduid;
 		group = &dn->dn_oldgid;
 	} else if (data) {
 		user = &dn->dn_newuid;
 		group = &dn->dn_newgid;
 	}
 
 	/*
 	 * Must always call the callback in case the object
 	 * type has changed and that type isn't an object type to track
 	 */
 	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
 	    user, group);
 
 	/*
 	 * Preserve existing uid/gid when the callback can't determine
 	 * what the new uid/gid are and the callback returned EEXIST.
 	 * The EEXIST error tells us to just use the existing uid/gid.
 	 * If we don't know what the old values are then just assign
 	 * them to 0, since that is a new file being created.
 	 */
 	if (!before && data == NULL && error == EEXIST) {
 		if (flags & DN_ID_OLD_EXIST) {
 			dn->dn_newuid = dn->dn_olduid;
 			dn->dn_newgid = dn->dn_oldgid;
 		} else {
 			dn->dn_newuid = 0;
 			dn->dn_newgid = 0;
 		}
 		error = 0;
 	}
 
 	if (db)
 		mutex_exit(&db->db_mtx);
 
 	/* Publish the outcome in the dnode's id flags. */
 	mutex_enter(&dn->dn_mtx);
 	if (error == 0 && before)
 		dn->dn_id_flags |= DN_ID_OLD_EXIST;
 	if (error == 0 && !before)
 		dn->dn_id_flags |= DN_ID_NEW_EXIST;
 
 	if (have_spill) {
 		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 	} else {
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 	}
 	mutex_exit(&dn->dn_mtx);
 	/* Only the spill path took a hold (FTAG) that must be released. */
 	if (have_spill)
 		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 /*
  * Report whether the on-disk objset header has
  * OBJSET_FLAG_USERACCOUNTING_COMPLETE set.
  *
  * NOTE(review): the masked flag value is returned directly as a boolean_t
  * rather than being normalized to B_TRUE/B_FALSE; this relies on the flag
  * bit surviving the conversion -- confirm against the flag's definition.
  */
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
 	return (os->os_phys->os_flags &
 	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 }
 
 /*
  * Bring an objset that predates user/group space accounting up to date by
  * dirtying the bonus buffer of every object, so that each one is synced
  * out and accounted.
  *
  * Returns 0 if accounting is already complete or once the walk has
  * finished, ENOTSUP if accounting is not enabled for this objset, EINVAL
  * for a snapshot, and EINTR if a signal is pending.  Failures on
  * individual objects are skipped, and the error that terminates the
  * dmu_object_next() walk is not propagated.
  */
 int
 dmu_objset_userspace_upgrade(objset_t *os)
 {
 	uint64_t obj;
 	int err = 0;
 
 	if (dmu_objset_userspace_present(os))
 		return (0);
 	if (!dmu_objset_userused_enabled(os))
 		return (SET_ERROR(ENOTSUP));
 	if (dmu_objset_is_snapshot(os))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
 	 * synced out and now accounted.  If this is called
 	 * concurrently, or if we already did some work before crashing,
 	 * that's fine, since we track each object's accounted state
 	 * independently.
 	 */
 
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		dmu_tx_t *tx;
 		dmu_buf_t *db;
 		int objerr;
 
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			return (SET_ERROR(EINTR));
 
 		/* Objects whose bonus buffer can't be held are skipped. */
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
 		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
 		if (objerr != 0) {
 			/*
 			 * Drop the bonus hold taken above before moving on;
 			 * otherwise it would be leaked on this path.
 			 */
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_abort(tx);
 			continue;
 		}
 		dmu_buf_will_dirty(db, tx);
 		dmu_buf_rele(db, FTAG);
 		dmu_tx_commit(tx);
 	}
 
 	/* Mark the upgrade complete and wait for it to reach disk. */
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	return (0);
 }
 
 /*
  * Fetch space statistics for the objset by forwarding all four output
  * pointers, unchanged, to dsl_dataset_space() on the objset's dataset.
  */
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
 	    usedobjsp, availobjsp);
 }
 
 /*
  * Return the fsid guid of the objset's dataset, as reported by
  * dsl_dataset_fsid_guid().
  */
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
 	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
 }
 
 /*
  * Fill in the cheaply available statistics for an objset: the objset type
  * always, plus whatever dsl_dataset_fast_stat() supplies when the objset
  * is backed by a dataset.
  */
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
 	stat->dds_type = os->os_phys->os_type;

 	/* An objset without a dataset has nothing more to report. */
 	if (os->os_dsl_dataset == NULL)
 		return;

 	dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
 }
 
 /*
  * Add the objset's statistics to the nvlist 'nv': the dataset's own
  * statistics (when the objset has a dataset), the objset type as
  * ZFS_PROP_TYPE, and whether user accounting is complete as
  * ZFS_PROP_USERACCOUNTING.
  */
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
 	/* Only an objset of type DMU_OST_META may lack a dataset. */
 	ASSERT(os->os_dsl_dataset ||
 	    os->os_phys->os_type == DMU_OST_META);
 
 	if (os->os_dsl_dataset != NULL)
 		dsl_dataset_stats(os->os_dsl_dataset, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
 	    os->os_phys->os_type);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
 	    dmu_objset_userspace_present(os));
 }
 
 /*
  * Tell whether the objset belongs to a snapshot.  An objset with no
  * dataset is never a snapshot; otherwise the answer comes from
  * dsl_dataset_is_snapshot().
  */
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset == NULL)
 		return (B_FALSE);

 	return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
 }
 
 /*
  * Look up the snapshot called 'name' in the snapshot-names ZAP of the
  * objset's dataset, using a normalized (MT_FIRST) match.  'real', 'maxlen'
  * and 'conflict' are passed straight through to zap_lookup_norm();
  * presumably 'real' receives the name as actually stored -- confirm
  * against zap_lookup_norm().
  *
  * Returns ENOENT if the dataset has no snapshot-names ZAP, otherwise the
  * result of zap_lookup_norm().
  */
 int
 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t ignored;
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	/* The looked-up value itself is not needed, only the name. */
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
-	    real, maxlen, conflict));
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+	    MT_FIRST, real, maxlen, conflict));
 }
 
 /*
  * Return the next snapshot of the objset's dataset, resuming from the
  * serialized ZAP cursor position in *offp.
  *
  * On success the snapshot's name is copied into 'name' (a buffer of
  * 'namelen' bytes), its first integer value is stored in *idp if idp is
  * non-NULL, its normalization-conflict flag is stored in *case_conflict if
  * that is non-NULL, and *offp is advanced to the following entry.
  *
  * Returns ENOENT if the dataset has no snapshot-names ZAP or the cursor
  * has no further entry, and ENAMETOOLONG if the name does not fit.
  * The caller must hold the pool configuration.
  */
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, *offp);
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * NOTE(review): namelen is a signed int compared against a size_t,
 	 * so a negative namelen would be converted to a large unsigned
 	 * value here -- confirm callers never pass one.
 	 */
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
 	if (idp)
 		*idp = attr.za_first_integer;
 	if (case_conflict)
 		*case_conflict = attr.za_normalization_conflict;
 	/* Step past this entry so the next call resumes after it. */
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 	zap_cursor_fini(&cursor);
 
 	return (0);
 }
 
 /*
  * Return the next child directory of the objset's dsl_dir, resuming from
  * the serialized ZAP cursor position in *offp.
  *
  * On success the child's name is copied into 'name' (a buffer of 'namelen'
  * bytes), its first integer value is stored in *idp if idp is non-NULL,
  * and *offp is advanced to the following entry.
  *
  * Returns ENOENT if the objset's dataset is not the head dataset of its
  * directory or the cursor has no further entry, and ENAMETOOLONG if the
  * name does not fit.
  */
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
-	    dd->dd_phys->dd_head_dataset_obj)
+	    dsl_dir_phys(dd)->dd_head_dataset_obj)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    dd->dd_pool->dp_meta_objset,
-	    dd->dd_phys->dd_child_dir_zapobj, *offp);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * NOTE(review): namelen is a signed int compared against a size_t,
 	 * so a negative namelen would be converted to a large unsigned
 	 * value here -- confirm callers never pass one.
 	 */
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
 	if (idp)
 		*idp = attr.za_first_integer;
 	/* Step past this entry so the next call resumes after it. */
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 	zap_cursor_fini(&cursor);
 
 	return (0);
 }
 
 /*
  * Find objsets under and including ddobj, call func(dp, ds, arg) on each.
  *
  * The traversal is depth first: child directories are visited first (only
  * with DS_FIND_CHILDREN), then the snapshots of this directory's head
  * dataset (only with DS_FIND_SNAPSHOTS), and finally the head dataset
  * itself.  Hidden directories, whose names start with '$', are skipped and
  * reported as success.  The walk stops at the first non-zero return from
  * func or from a hold, and that error is returned.
  *
  * The caller must hold the pool configuration.
  */
 int
 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
     int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
 	if (err != 0)
 		return (err);
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		return (0);
 	}
 
-	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	/* Allocated rather than declared on the stack; freed on every exit. */
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj);
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/* Recurse into the child directory object. */
 			err = dmu_objset_find_dp(dp, attr->za_first_integer,
 			    func, arg, flags);
 			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err != 0) {
 			dsl_dir_rele(dd, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		/* Note: this declaration shadows the function-level 'ds'. */
 		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
-			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			/* Only the ZAP object number is needed from the head. */
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				/* Hold each snapshot just for the callback. */
 				err = dsl_dataset_hold_obj(dp,
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 				err = func(dp, ds, arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 	if (err != 0)
 		return (err);
 	err = func(dp, ds, arg);
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it
  * will not be held when the callback is called.
  * Therefore this function should only be used when the pool is not changing
  * (e.g. in syncing context), or the callback can deal with the possible races.
  *
  * The traversal is depth first: child directories (only with
  * DS_FIND_CHILDREN), then snapshots as "name@snap" (only with
  * DS_FIND_SNAPSHOTS), then 'name' itself.  Hidden directories, whose names
  * start with '$', are skipped and reported as success.  The walk stops at
  * the first non-zero return, which is passed back to the caller.
  */
 static int
 dmu_objset_find_impl(spa_t *spa, const char *name,
     int func(const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
 	uint64_t thisobj;
 	int err;
 
 	dsl_pool_config_enter(dp, FTAG);
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
 	if (err != 0) {
 		dsl_pool_config_exit(dp, FTAG);
 		return (err);
 	}
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 		return (0);
 	}
 
-	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	/* Allocated rather than declared on the stack; freed on every exit. */
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj);
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/*
 			 * The pool config is dropped around the recursive
 			 * call (which takes it itself) and retaken before
 			 * the cursor is used again.
 			 */
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
 			dsl_pool_config_exit(dp, FTAG);
 			err = dmu_objset_find_impl(spa, child,
 			    func, arg, flags);
 			dsl_pool_config_enter(dp, FTAG);
 			strfree(child);
 			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err != 0) {
 			dsl_dir_rele(dd, FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
-			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			/* Only the ZAP object number is needed from the head. */
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				/* The callback runs without the pool config held. */
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
 				dsl_pool_config_exit(dp, FTAG);
 				err = func(child, arg);
 				dsl_pool_config_enter(dp, FTAG);
 				strfree(child);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 	dsl_pool_config_exit(dp, FTAG);
 
 	if (err != 0)
 		return (err);
 
 	/* Apply to self. */
 	return (func(name, arg));
 }
 
 /*
  * See comment above dmu_objset_find_impl().
  *
  * Opens the pool that 'name' belongs to, runs dmu_objset_find_impl() on it
  * and closes the pool again.  Returns the spa_open() error if the pool
  * cannot be opened, otherwise the result of the traversal.
  */
 int
 dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	error = dmu_objset_find_impl(spa, name, func, arg, flags);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Store an opaque consumer pointer in the objset.
  * The caller must hold os->os_user_ptr_lock.
  */
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	os->os_user_ptr = user_ptr;
 }
 
 /*
  * Return the opaque consumer pointer stored in the objset by
  * dmu_objset_set_user().  The caller must hold os->os_user_ptr_lock.
  */
 void *
 dmu_objset_get_user(objset_t *os)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
 
 /*
  * Extract the filesystem portion of a snapshot name -- everything before
  * the first '@' -- into buf, which must be at least MAXNAMELEN bytes.
  *
  * Returns 0 on success, EINVAL when snapname contains no '@', and
  * ENAMETOOLONG when the filesystem portion is MAXNAMELEN bytes or longer.
  */
 int
 dmu_fsname(const char *snapname, char *buf)
 {
 	char *at_sign;

 	at_sign = strchr(snapname, '@');
 	if (at_sign == NULL)
 		return (SET_ERROR(EINVAL));

 	if (at_sign - snapname < MAXNAMELEN) {
 		/* The size passed includes room for the terminating NUL. */
 		(void) strlcpy(buf, snapname, at_sign - snapname + 1);
 		return (0);
 	}

 	return (SET_ERROR(ENAMETOOLONG));
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	(revision 275782)
@@ -1,2215 +1,2225 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 
 #ifdef __FreeBSD__
 #undef dump_write
 #define dump_write dmu_dump_write
 #endif
 
 /*
  * Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c.
  * When it is set, a level-0 data block that cannot be read while sending
  * is sent as a block filled with that pattern instead of failing the send
  * with EIO.
  */
 int zfs_send_corrupt_data = B_FALSE;
 
 /* Presumably the hold tag used by the receive code -- not used in view. */
 static char *dmu_recv_tag = "dmu_recv_tag";
 /* Presumably the name of the temporary receive clone -- not used in view. */
 static const char *recv_clone_name = "%recv";
 
 /*
  * Write 'len' bytes from 'buf' to the send stream.
  *
  * The bytes are first folded into the stream's running fletcher-4 checksum
  * (dsp->dsa_zc).  In the kernel they are then written to dsp->dsa_fp with
  * fo_write(); outside the kernel nothing is written and the result is
  * EOPNOTSUPP.  The stream offset *dsp->dsa_off is advanced by 'len' under
  * ds_sendstream_lock whether or not the write succeeded.
  *
  * 'len' must be a multiple of 8.  The result is stored in dsp->dsa_err and
  * also returned.
  */
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
 	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
 	struct uio auio;
 	struct iovec aiov;
 	ASSERT0(len % 8);
 
 	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
 	/* Describe the buffer as a single-segment kernel-space write. */
 	aiov.iov_base = buf;
 	aiov.iov_len = len;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = len;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_offset = (off_t)-1;
 	auio.uio_td = dsp->dsa_td;
 #ifdef _KERNEL
 	if (dsp->dsa_fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 	dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
 	    dsp->dsa_td);
 #else
 	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
 	dsp->dsa_err = EOPNOTSUPP;
 #endif
 	mutex_enter(&ds->ds_sendstream_lock);
 	*dsp->dsa_off += len;
 	mutex_exit(&ds->ds_sendstream_lock);
 
 	return (dsp->dsa_err);
 }
 
 /*
  * Queue or emit a DRR_FREE record covering [offset, offset + length) of
  * 'object'.  A length of -1ULL means "to the end of the object".
  *
  * Free records are aggregated: a record that directly continues the
  * pending DRR_FREE is merged into it, and the result stays pending (in
  * dsp->dsa_drr) until a different kind of record forces it out.  A record
  * with length -1ULL is written immediately.  Nothing is sent for a
  * non-incremental stream.
  *
  * Returns 0 on success or EINTR if writing to the stream failed.
  */
 static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
 
 	/*
 	 * When we receive a free record, dbuf_free_range() assumes
 	 * that the receiving system doesn't have any dbufs in the range
 	 * being freed.  This is always true because there is a one-record
 	 * constraint: we only send one WRITE record for any given
 	 * object+offset.  We know that the one-record constraint is
 	 * true because we always send data in increasing order by
 	 * object,offset.
 	 *
 	 * If the increasing-order constraint ever changes, we should find
 	 * another way to assert that the one-record constraint is still
 	 * satisfied.
 	 */
 	ASSERT(object > dsp->dsa_last_data_object ||
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 
 	/*
 	 * If we are doing a non-incremental send, then there can't
 	 * be any data in the dataset we're receiving into.  Therefore
 	 * a free record would simply be a no-op.  Save space by not
 	 * sending it to begin with.
 	 */
 	if (!dsp->dsa_incremental)
 		return (0);
 
 	/* If offset + length would wrap, treat it as "free to the end". */
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREE, push it out,
 	 * since free block aggregation can only be done for blocks of the
 	 * same type (i.e., DRR_FREE records can only be aggregated with
 	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
 	 * aggregated with other DRR_FREEOBJECTS records).
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE &&
 	    dsp->dsa_pending_op != PENDING_FREE) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	if (dsp->dsa_pending_op == PENDING_FREE) {
 		/*
 		 * There should never be a PENDING_FREE if length is -1
 		 * (because dump_dnode is the only place where this
 		 * function is called with a -1, and only after flushing
 		 * any pending record).
 		 */
 		ASSERT(length != -1ULL);
 		/*
 		 * Check to see whether this free block can be aggregated
 		 * with pending one.
 		 */
 		if (drrf->drr_object == object && drrf->drr_offset +
 		    drrf->drr_length == offset) {
 			drrf->drr_length += length;
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
 			if (dump_bytes(dsp, dsp->dsa_drr,
 			    sizeof (dmu_replay_record_t)) != 0)
 				return (SET_ERROR(EINTR));
 			dsp->dsa_pending_op = PENDING_NONE;
 		}
 	}
 	/* create a FREE record and make it pending */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_FREE;
 	drrf->drr_object = object;
 	drrf->drr_offset = offset;
 	drrf->drr_length = length;
 	drrf->drr_toguid = dsp->dsa_toguid;
 	if (length == -1ULL) {
 		/* An open-ended free is written out right away. */
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 	} else {
 		dsp->dsa_pending_op = PENDING_FREE;
 	}
 
 	return (0);
 }
 
 /*
  * Emit a DRR_WRITE record for 'blksz' bytes of 'data' at 'offset' in
  * 'object', followed by the data itself.  Any pending aggregated record is
  * flushed first.
  *
  * When 'bp' is non-NULL and not embedded, its checksum and dedup key are
  * copied into the record; otherwise the record's checksum type is
  * ZIO_CHECKSUM_OFF.  The position of the last byte written is remembered
  * in dsp->dsa_last_data_object and dsp->dsa_last_data_offset.
  *
  * Returns 0 on success or EINTR if writing to the stream failed.
  */
 static int
 dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
     uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
 	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 
 	/*
 	 * We send data in increasing object, offset order.
 	 * See comment in dump_free() for details.
 	 */
 	ASSERT(object > dsp->dsa_last_data_object ||
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 	dsp->dsa_last_data_object = object;
 	dsp->dsa_last_data_offset = offset + blksz - 1;
 
 	/*
 	 * If there is any kind of pending aggregation (currently either
 	 * a grouping of free objects or free blocks), push it out to
 	 * the stream, since aggregation can't be done across operations
 	 * of different types.
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 	/* write a DATA record */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_WRITE;
 	drrw->drr_object = object;
 	drrw->drr_type = type;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
 	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
 		/*
 		 * There's no pre-computed checksum for partial-block
 		 * writes or embedded BP's, so (like
 		 * fletcher4-checksummed blocks) userland will have to
 		 * compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
 		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
 		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
 			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
 		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
 		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
 		drrw->drr_key.ddk_cksum = bp->blk_cksum;
 	}
 
 	/* Record header first, then the payload. */
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 	if (dump_bytes(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 /*
  * Emit a DRR_WRITE_EMBEDDED record for an embedded block pointer: the
  * record header followed by the block pointer's (still compressed)
  * payload, padded to a multiple of 8 bytes.  Any pending aggregated record
  * is flushed first.
  *
  * 'bp' must be an embedded block pointer.  Returns 0 on success or EINTR
  * if writing to the stream failed.
  */
 static int
 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     int blksz, const blkptr_t *bp)
 {
 	char buf[BPE_PAYLOAD_SIZE];
 	struct drr_write_embedded *drrw =
 	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
 
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	ASSERT(BP_IS_EMBEDDED(bp));
 
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
 	drrw->drr_object = object;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
 	drrw->drr_compression = BP_GET_COMPRESS(bp);
 	drrw->drr_etype = BPE_GET_ETYPE(bp);
 	drrw->drr_lsize = BPE_GET_LSIZE(bp);
 	drrw->drr_psize = BPE_GET_PSIZE(bp);
 
 	/*
 	 * The payload is written out rounded up to a multiple of 8 bytes.
 	 * Clear the buffer first so that the padding past drr_psize is
 	 * zero rather than leftover stack contents (this assumes
 	 * decode_embedded_bp_compressed() fills in only the payload bytes).
 	 */
 	bzero(buf, sizeof (buf));
 	decode_embedded_bp_compressed(bp, buf);
 
 	/* Record header first, then the padded payload. */
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 	if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 /*
  * Emit a DRR_SPILL record for 'object' followed by the 'blksz' bytes of
  * spill data.  Any pending aggregated record is flushed first.
  * Returns 0 on success or EINTR if writing to the stream failed.
  */
 static int
 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 {
 	struct drr_spill *spill = &(dsp->dsa_drr->drr_u.drr_spill);

 	/* Aggregation can't span record types; push out what is queued. */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0) {
 			return (SET_ERROR(EINTR));
 		}
 		dsp->dsa_pending_op = PENDING_NONE;
 	}

 	/* Build the SPILL record header in the shared record buffer. */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_SPILL;
 	spill->drr_object = object;
 	spill->drr_length = blksz;
 	spill->drr_toguid = dsp->dsa_toguid;

 	/* Header first ... */
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));

 	/* ... then the spill block contents. */
 	if (dump_bytes(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));

 	return (0);
 }
 
 /*
  * Queue a DRR_FREEOBJECTS record covering 'numobjs' objects starting at
  * 'firstobj'.
  *
  * Records are aggregated: a range that directly continues the pending
  * DRR_FREEOBJECTS record is merged into it.  The resulting record is left
  * pending in dsp->dsa_drr and is only written once another record forces
  * it out.  Nothing is sent for a non-incremental stream.
  *
  * Returns 0 on success or EINTR if writing a pending record failed.
  */
 static int
 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 {
 	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
 	/* See comment in dump_free(). */
 	if (!dsp->dsa_incremental)
 		return (0);
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 	 * push it out, since free block aggregation can only be done for
 	 * blocks of the same type (i.e., DRR_FREE records can only be
 	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
 	 * can only be aggregated with other DRR_FREEOBJECTS records).
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE &&
 	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
 		/*
 		 * See whether this free object array can be aggregated
 		 * with pending one
 		 */
 		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
 			drrfo->drr_numobjs += numobjs;
 			return (0);
 		} else {
 			/* can't be aggregated.  Push out pending record */
 			if (dump_bytes(dsp, dsp->dsa_drr,
 			    sizeof (dmu_replay_record_t)) != 0)
 				return (SET_ERROR(EINTR));
 			dsp->dsa_pending_op = PENDING_NONE;
 		}
 	}
 
 	/* write a FREEOBJECTS record */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
 	drrfo->drr_firstobj = firstobj;
 	drrfo->drr_numobjs = numobjs;
 	drrfo->drr_toguid = dsp->dsa_toguid;
 
 	/* Left pending so that a following range can be merged into it. */
 	dsp->dsa_pending_op = PENDING_FREEOBJECTS;
 
 	return (0);
 }
 
 /*
  * Emit the stream records describing one dnode.
  *
  * A NULL or unallocated (DMU_OT_NONE) dnode is reported as a single freed
  * object.  Otherwise a DRR_OBJECT record is written, followed by the bonus
  * buffer padded to a multiple of 8 bytes, followed by a free record for
  * everything past the dnode's last block.  Any pending aggregated record
  * is flushed first.
  *
  * Returns 0 on success or EINTR if writing to the stream failed.
  */
 static int
 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 {
 	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
 
 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 		return (dump_freeobjects(dsp, object, 1));
 
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_bytes(dsp, dsp->dsa_drr,
 		    sizeof (dmu_replay_record_t)) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	/* write an OBJECT record */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_OBJECT;
 	drro->drr_object = object;
 	drro->drr_type = dnp->dn_type;
 	drro->drr_bonustype = dnp->dn_bonustype;
 	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 	drro->drr_bonuslen = dnp->dn_bonuslen;
 	drro->drr_checksumtype = dnp->dn_checksum;
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
 	/*
 	 * Without the large-blocks stream feature, the advertised block
 	 * size is capped at SPA_OLD_MAXBLOCKSIZE.
 	 */
 	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
 		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
 
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 
 	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
 		return (SET_ERROR(EINTR));
 
 	/* Free anything past the end of the file. */
 	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
 		return (SET_ERROR(EINTR));
 	/* Also catch a write error recorded in dsa_err along the way. */
 	if (dsp->dsa_err != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 /*
  * Decide whether an embedded block pointer may be sent as a
  * WRITE_EMBEDDED record.  That requires all of:
  *   - bp is an embedded block pointer;
  *   - its compression function is a legacy one, or the stream has
  *     DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 enabled;
  *   - its embedded type is BP_EMBEDDED_TYPE_DATA and the stream has
  *     DMU_BACKUP_FEATURE_EMBED_DATA enabled.
  */
 static boolean_t
 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
 {
 	if (!BP_IS_EMBEDDED(bp))
 		return (B_FALSE);

 	/* Non-legacy compression needs the LZ4 embed feature. */
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
 	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) == 0)
 		return (B_FALSE);

 	/* Only the data embed type is known; anything else is refused. */
 	if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA)
 		return (B_FALSE);

 	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 		return (B_TRUE);

 	return (B_FALSE);
 }
 
 /*
  * Number of bytes of an object covered by a single block pointer at the
  * given indirection level: the data block size (dn_datablkszsec sectors of
  * 1 << SPA_MINBLOCKSHIFT bytes) multiplied by the number of block pointers
  * per indirect block, once for each level.  'dnp' is parenthesized so that
  * any pointer expression (e.g. &dn or p + 1) can be passed safely.
  */
 #define	BP_SPAN(dnp, level) \
 	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
 	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
 
 /*
  * Traversal callback handed to traverse_dataset() by dmu_send_impl().  It
  * turns each visited block pointer into send-stream records.
  *
  * arg is the dmu_sendarg_t describing the stream.  Returns 0 to keep
  * traversing, EINTR when a signal is pending, EIO when a required block
  * cannot be read, or the error produced by the dump_*() helper that
  * emitted the record.
  */
 /* ARGSUSED */
 static int
 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	dmu_sendarg_t *dsp = arg;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 	int err = 0;
 
 	/* Abort the send promptly if the sending thread was signalled. */
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (SET_ERROR(EINTR));
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		/* Special objects other than the meta-dnode are not sent. */
 		return (0);
 	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
 		/*
 		 * If we are sending a non-snapshot (which is allowed on
 		 * read-only pools), it may have a ZIL, which must be ignored.
 		 */
 		return (0);
 	} else if (BP_IS_HOLE(bp) &&
 	    zb->zb_object == DMU_META_DNODE_OBJECT) {
 		/* A hole in the meta-dnode covers a range of free objects. */
 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
 	} else if (BP_IS_HOLE(bp)) {
 		/* A hole in a regular object is a freed byte range. */
 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
 		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		/* Indirect blocks and the objset block produce no records. */
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
 		/* A block of dnodes: emit one record set per dnode slot. */
 		dnode_phys_t *blk;
 		int i;
 		int blksz = BP_GET_LSIZE(bp);
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
 		blk = abuf->b_data;
 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
 			/* Object number = first object of this block + i. */
 			uint64_t dnobj = (zb->zb_blkid <<
 			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
 			err = dump_dnode(dsp, dnobj, blk+i);
 			if (err != 0)
 				break;
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	} else if (type == DMU_OT_SA) {
 		/* Spill block holding system attributes. */
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
 		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	} else if (backup_do_embed(dsp, bp)) {
 		/* it's an embedded level-0 block of a regular object */
 		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 		err = dump_write_embedded(dsp, zb->zb_object,
 		    zb->zb_blkid * blksz, blksz, bp);
 	} else { /* it's a level-0 block of a regular object */
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 		uint64_t offset;
 
 		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		ASSERT0(zb->zb_level);
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0) {
 			if (zfs_send_corrupt_data) {
 				/* Send a block filled with 0x"zfs badd bloc" */
 				abuf = arc_buf_alloc(spa, blksz, &abuf,
 				    ARC_BUFC_DATA);
 				uint64_t *ptr;
 				for (ptr = abuf->b_data;
 				    (char *)ptr < (char *)abuf->b_data + blksz;
 				    ptr++)
 					*ptr = 0x2f5baddb10c;
 			} else {
 				return (SET_ERROR(EIO));
 			}
 		}
 
 		offset = zb->zb_blkid * blksz;
 
 		if (!(dsp->dsa_featureflags &
 		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 		    blksz > SPA_OLD_MAXBLOCKSIZE) {
 			/*
 			 * The stream does not carry the LARGE_BLOCKS feature,
 			 * so split the block into SPA_OLD_MAXBLOCKSIZE-sized
 			 * writes.  No bp is passed for the pieces.
 			 */
 			char *buf = abuf->b_data;
 			while (blksz > 0 && err == 0) {
 				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
 				err = dump_write(dsp, type, zb->zb_object,
 				    offset, n, NULL, buf);
 				offset += n;
 				buf += n;
 				blksz -= n;
 			}
 		} else {
 			err = dump_write(dsp, type, zb->zb_object,
 			    offset, blksz, bp, abuf->b_data);
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
 	/* The dump_*() helpers are expected to fail only with EINTR. */
 	ASSERT(err == 0 || err == EINTR);
 	return (err);
 }
 
 /*
  * Common implementation behind dmu_send() and dmu_send_obj(): writes a
  * send stream for ds consisting of a BEGIN record, the records produced by
  * backup_cb() during traversal, and an END record carrying the accumulated
  * checksum.
  *
  * fromzb, when non-NULL, supplies the guid and creation txg of the
  * incremental source; NULL produces a full stream.  embedok and
  * large_block_ok permit (but do not force) the corresponding stream
  * features.
  *
  * Releases dp using the specified tag.  The caller keeps its hold on ds.
  *
  * NOTE(review): the body stores `fp' into dsa_fp unconditionally, while the
  * `#ifdef illumos' variant of the signature names that argument `vp'.
  */
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
 #ifdef illumos
     boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 #else
     boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	objset_t *os;
 	dmu_replay_record_t *drr;
 	dmu_sendarg_t *dsp;
 	int err;
 	uint64_t fromtxg = 0;
 	uint64_t featureflags = 0;
 
 	err = dmu_objset_from_ds(ds, &os);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	/* Build the BEGIN record. */
 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
 	    DMU_SUBSTREAM);
 
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		uint64_t version;
 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
 			kmem_free(drr, sizeof (dmu_replay_record_t));
 			dsl_pool_rele(dp, tag);
 			return (SET_ERROR(EINVAL));
 		}
 		if (version >= ZPL_VERSION_SA) {
 			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
 		}
 	}
 #endif
 
 	/* Optional features need both caller consent and pool/dataset use. */
 	if (large_block_ok && ds->ds_large_blocks)
 		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
 		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
 	} else {
 		embedok = B_FALSE;
 	}
 
 	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
 	    featureflags);
 
 	drr->drr_u.drr_begin.drr_creation_time =
-	    ds->ds_phys->ds_creation_time;
+	    dsl_dataset_phys(ds)->ds_creation_time;
 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
 	if (is_clone)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
-	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 
 	if (fromzb != NULL) {
 		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
 		fromtxg = fromzb->zbm_creation_txg;
 	}
 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
 	if (!dsl_dataset_is_snapshot(ds)) {
 		/* Non-snapshots get a placeholder snapshot component. */
 		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
 		    sizeof (drr->drr_u.drr_begin.drr_toname));
 	}
 
 	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 
 	dsp->dsa_drr = drr;
 	dsp->dsa_outfd = outfd;
 	dsp->dsa_proc = curproc;
 	dsp->dsa_td = curthread;
 	dsp->dsa_fp = fp;
 	dsp->dsa_os = os;
 	dsp->dsa_off = off;
-	dsp->dsa_toguid = ds->ds_phys->ds_guid;
+	dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
 	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
 	dsp->dsa_pending_op = PENDING_NONE;
 	dsp->dsa_incremental = (fromzb != NULL);
 	dsp->dsa_featureflags = featureflags;
 
 	/* Register this stream on the dataset's list of active sends. */
 	mutex_enter(&ds->ds_sendstream_lock);
 	list_insert_head(&ds->ds_sendstreams, dsp);
 	mutex_exit(&ds->ds_sendstream_lock);
 
 	/* Take a long hold on ds before giving up the pool hold. */
 	dsl_dataset_long_hold(ds, FTAG);
 	dsl_pool_rele(dp, tag);
 
 	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 		err = dsp->dsa_err;
 		goto out;
 	}
 
 	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
 	    backup_cb, dsp);
 
 	/* Flush a record that is still pending after the traversal. */
 	if (dsp->dsa_pending_op != PENDING_NONE)
 		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
 			err = SET_ERROR(EINTR);
 
 	if (err != 0) {
 		/* Prefer the error recorded in dsa_err over a bare EINTR. */
 		if (err == EINTR && dsp->dsa_err != 0)
 			err = dsp->dsa_err;
 		goto out;
 	}
 
 	/* Reuse drr for the END record carrying the stream checksum. */
 	bzero(drr, sizeof (dmu_replay_record_t));
 	drr->drr_type = DRR_END;
 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
 	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 		err = dsp->dsa_err;
 		goto out;
 	}
 
 out:
 	mutex_enter(&ds->ds_sendstream_lock);
 	list_remove(&ds->ds_sendstreams, dsp);
 	mutex_exit(&ds->ds_sendstream_lock);
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
 	dsl_dataset_long_rele(ds, FTAG);
 
 	return (err);
 }
 
 /*
  * Send the dataset with object number tosnap in the named pool.  When
  * fromsnap is non-zero it is the object number of the incremental source,
  * which must pass the dsl_dataset_is_before() check against tosnap;
  * otherwise EXDEV is returned and nothing is written.
  *
  * Returns 0 on success or an errno value.  All holds taken here are
  * released before returning.
  */
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
     boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
     int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	dsl_dataset_t *fromds = NULL;
 	int err;
 
 	err = dsl_pool_hold(pool, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != 0) {
 		zfs_bookmark_phys_t zb;
 		boolean_t is_clone;
 
 		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 		if (!dsl_dataset_is_before(ds, fromds, 0))
 			err = SET_ERROR(EXDEV);
-		zb.zbm_creation_time = fromds->ds_phys->ds_creation_time;
-		zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg;
-		zb.zbm_guid = fromds->ds_phys->ds_guid;
+		zb.zbm_creation_time =
+		    dsl_dataset_phys(fromds)->ds_creation_time;
+		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 		is_clone = (fromds->ds_dir != ds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
 		if (err != 0) {
 			/*
 			 * Invalid incremental source.  Do not let the
 			 * dmu_send_impl() result overwrite this error;
 			 * since dmu_send_impl() is not called, the pool
 			 * hold must be dropped here.
 			 */
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
 		    embedok, large_block_ok, outfd, fp, off);
 	} else {
 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
 		    embedok, large_block_ok, outfd, fp, off);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Send the dataset named tosnap (a snapshot, or a filesystem/volume).
  * fromsnap, when non-NULL, names the incremental source and must contain
  * '@' (snapshot) or '#' (bookmark); otherwise EINVAL is returned.
  *
  * A filesystem or volume on a writeable pool is owned for the duration of
  * the send so that it cannot change; otherwise the dataset is only held.
  *
  * Returns 0 on success or an errno value.
  */
 int
 dmu_send(const char *tosnap, const char *fromsnap,
     boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
     int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 	boolean_t owned = B_FALSE;
 
 	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_pool_hold(tosnap, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
 		/*
 		 * We are sending a filesystem or volume.  Ensure
 		 * that it doesn't change by owning the dataset.
 		 */
 		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
 		owned = B_TRUE;
 	} else {
 		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
 	}
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != NULL) {
 		zfs_bookmark_phys_t zb;
 		boolean_t is_clone = B_FALSE;
 		/*
 		 * Length of the filesystem part of tosnap.  tosnap has no
 		 * '@' when a filesystem or volume is being sent, in which
 		 * case the whole name is the filesystem name.
 		 */
 		const char *at = strchr(tosnap, '@');
 		size_t fsnamelen = (at != NULL) ?
 		    (size_t)(at - tosnap) : strlen(tosnap);
 
 		/*
 		 * If the fromsnap is in a different filesystem, then
 		 * mark the send stream as a clone.
 		 */
 		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
 		    (fromsnap[fsnamelen] != '@' &&
 		    fromsnap[fsnamelen] != '#')) {
 			is_clone = B_TRUE;
 		}
 
 		if (strchr(fromsnap, '@')) {
 			/* Incremental from a snapshot. */
 			dsl_dataset_t *fromds;
 			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
 			if (err == 0) {
 				if (!dsl_dataset_is_before(ds, fromds, 0))
 					err = SET_ERROR(EXDEV);
 				zb.zbm_creation_time =
-				    fromds->ds_phys->ds_creation_time;
+				    dsl_dataset_phys(fromds)->ds_creation_time;
 				zb.zbm_creation_txg =
-				    fromds->ds_phys->ds_creation_txg;
-				zb.zbm_guid = fromds->ds_phys->ds_guid;
+				    dsl_dataset_phys(fromds)->ds_creation_txg;
+				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 				is_clone = (ds->ds_dir != fromds->ds_dir);
 				dsl_dataset_rele(fromds, FTAG);
 			}
 		} else {
 			/* Incremental from a bookmark. */
 			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
 		}
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
 		    embedok, large_block_ok, outfd, fp, off);
 	} else {
 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
 		    embedok, large_block_ok, outfd, fp, off);
 	}
 	if (owned)
 		dsl_dataset_disown(ds, FTAG);
 	else
 		dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Estimate the size in bytes of the send stream for snapshot ds, either as
  * a full stream (fromds == NULL) or incrementally from fromds, and store
  * the result in *sizep.
  *
  * The pool configuration must be held by the caller.  Returns 0 on
  * success, EINVAL if ds is not a snapshot, EXDEV if fromds fails the
  * dsl_dataset_is_before() check, or the error from a failed lookup.
  */
 int
 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	int err;
 	uint64_t size;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	/* tosnap must be a snapshot */
 	if (!dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
 	 * or the origin's fs.
 	 */
 	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
 		return (SET_ERROR(EXDEV));
 
 	/* Get uncompressed size estimate of changed data. */
 	if (fromds == NULL) {
-		size = ds->ds_phys->ds_uncompressed_bytes;
+		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 	} else {
 		/* Only the uncompressed figure (size) is used below. */
 		uint64_t used, comp;
 		err = dsl_dataset_space_written(fromds, ds,
 		    &used, &comp, &size);
 		if (err != 0)
 			return (err);
 	}
 
 	/*
 	 * Assume that space (both on-disk and in-stream) is dominated by
 	 * data.  We will adjust for indirect blocks and the copies property,
 	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
 	 */
 
 	/*
 	 * Subtract out approximate space used by indirect blocks.
 	 * Assume most space is used by data blocks (non-indirect, non-dnode).
 	 * Assume all blocks are recordsize.  Assume ditto blocks and
 	 * internal fragmentation counter out compression.
 	 *
 	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
 	 * block, which we observe in practice.
 	 */
 	uint64_t recordsize;
 	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
 	if (err != 0)
 		return (err);
 	size -= size / recordsize * sizeof (blkptr_t);
 
 	/* Add in the space for the record associated with each block. */
 	size += size / recordsize * sizeof (dmu_replay_record_t);
 
 	*sizep = size;
 
 	return (0);
 }
 
 /*
  * Argument bundle shared by dmu_recv_begin_check() and
  * dmu_recv_begin_sync(); filled in by dmu_recv_begin().
  */
 typedef struct dmu_recv_begin_arg {
 	/* name of the clone origin snapshot, or NULL */
 	const char *drba_origin;
 	/* receive cookie supplied by the caller of dmu_recv_begin() */
 	dmu_recv_cookie_t *drba_cookie;
 	/* credentials used for limit checks and dataset creation */
 	cred_t *drba_cred;
 	/* snapshot object chosen by the check, consumed by the sync */
 	uint64_t drba_snapobj;
 } dmu_recv_begin_arg_t;
 
 /*
  * Check half of dmu_recv_begin() for the case where the target filesystem
  * ds already exists.  Verifies that the temporary clone and the new
  * snapshot name do not exist yet, that the snapshot limit allows one more
  * snapshot and, for an incremental stream (fromguid != 0), that a snapshot
  * with that guid exists in ds's own dsl_dir.
  *
  * On success stores the snapshot object to clone from in
  * drba->drba_snapobj and returns 0; otherwise returns an errno value.
  */
 static int
 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
     uint64_t fromguid)
 {
 	uint64_t val;
 	int error;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* temporary clone name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EBUSY : error);
 
 	/* new snapshot name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
-	    8, 1, &val);
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EEXIST : error);
 
 	/*
 	 * Check snapshot limit before receiving. We'll recheck again at the
 	 * end, but might as well abort before receiving if we're already over
 	 * the limit.
 	 *
 	 * Note that we do not check the file system limit with
 	 * dsl_dir_fscount_check because the temporary %clones don't count
 	 * against that limit.
 	 */
 	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
 	    NULL, drba->drba_cred);
 	if (error != 0)
 		return (error);
 
 	if (fromguid != 0) {
 		dsl_dataset_t *snap;
-		uint64_t obj = ds->ds_phys->ds_prev_snap_obj;
+		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		/* Find snapshot in this dir that matches fromguid. */
 		while (obj != 0) {
 			error = dsl_dataset_hold_obj(dp, obj, FTAG,
 			    &snap);
 			if (error != 0)
 				return (SET_ERROR(ENODEV));
 			/* Walked out of this dsl_dir: no match here. */
 			if (snap->ds_dir != ds->ds_dir) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			/* On a match, leave the loop with snap still held. */
-			if (snap->ds_phys->ds_guid == fromguid)
+			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
 				break;
-			obj = snap->ds_phys->ds_prev_snap_obj;
+			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 		}
 		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
 		if (drba->drba_cookie->drc_force) {
 			drba->drba_snapobj = obj;
 		} else {
 			/*
 			 * If we are not forcing, there must be no
 			 * changes since fromsnap.
 			 */
 			if (dsl_dataset_modified_since_snap(ds, snap)) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ETXTBSY));
 			}
 			drba->drba_snapobj = ds->ds_prev->ds_object;
 		}
 
 		dsl_dataset_rele(snap, FTAG);
 	} else {
 		/* if full, most recent snapshot must be $ORIGIN */
-		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
+		if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL)
 			return (SET_ERROR(ENODEV));
-		drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj;
+		drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 
 	return (0);
 
 }
 
 /*
  * Check callback passed to dsl_sync_task() by dmu_recv_begin().  Validates
  * the BEGIN record against the pool's version and enabled features, then
  * validates the target: an existing filesystem is checked by
  * recv_begin_check_existing_impl(); a missing one must be the target of a
  * full stream or a clone whose parent exists and whose limits allow a new
  * filesystem and snapshot.
  *
  * Returns 0 if the receive may start, otherwise an errno value.
  */
 static int
 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	int error;
 	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 
 	/* Reject compound streams, bad types, and clones with no origin. */
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
 	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
 		return (SET_ERROR(EINVAL));
 
 	/* Verify pool version supports SA if SA_SPILL feature set */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
 	 * record to a plain WRITE record, so the pool must have the
 	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
 	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * The receiving code doesn't know how to translate large blocks
 	 * to smaller ones, so the pool must have the LARGE_BLOCKS
 	 * feature enabled if the stream has LARGE_BLOCKS.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
 		if (flags & DRR_FLAG_CLONE) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_begin_check_existing_impl(drba, ds, fromguid);
 		dsl_dataset_rele(ds, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
 		char buf[MAXNAMELEN];
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
 		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
 			return (SET_ERROR(ENOENT));
 
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
 		/* Copy tofs up to, but not including, its last '/'. */
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Check filesystem and snapshot limits before receiving. We'll
 		 * recheck snapshot limits again at the end (we create the
 		 * filesystems and increment those counts during begin_sync).
 		 */
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		if (drba->drba_origin != NULL) {
 			/* The origin must be a snapshot with guid fromguid. */
 			dsl_dataset_t *origin;
 			error = dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!dsl_dataset_is_snapshot(origin)) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
-			if (origin->ds_phys->ds_guid != fromguid) {
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			dsl_dataset_rele(origin, FTAG);
 		}
 		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	return (error);
 }
 
 /*
  * Sync callback passed to dsl_sync_task() by dmu_recv_begin().  Creates
  * the dataset that will receive the stream: a temporary clone (named
  * recv_clone_name) when the target filesystem already exists, otherwise a
  * new filesystem, optionally cloned from drba_origin.  The new dataset is
  * owned with dmu_recv_tag, marked inconsistent, and stored in the receive
  * cookie's drc_ds.
  */
 static void
 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	dsl_dataset_t *ds, *newds;
 	uint64_t dsobj;
 	int error;
 	uint64_t crflags;
 
 	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
 	    DS_FLAG_CI_DATASET : 0;
 
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* create temporary clone */
 		dsl_dataset_t *snap = NULL;
 		if (drba->drba_snapobj != 0) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    drba->drba_snapobj, FTAG, &snap));
 		}
 		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
 		    snap, crflags, drba->drba_cred, tx);
 		/* snap is only held (non-NULL) when drba_snapobj != 0. */
 		if (snap != NULL)
 			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele(ds, FTAG);
 	} else {
 		dsl_dir_t *dd;
 		const char *tail;
 		dsl_dataset_t *origin = NULL;
 
 		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
 
 		if (drba->drba_origin != NULL) {
 			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin));
 		}
 
 		/* Create new dataset. */
 		dsobj = dsl_dataset_create_sync(dd,
 		    strrchr(tofs, '/') + 1,
 		    origin, crflags, drba->drba_cred, tx);
 		if (origin != NULL)
 			dsl_dataset_rele(origin, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		drba->drba_cookie->drc_newfs = B_TRUE;
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
 	/* A LARGE_BLOCKS stream activates the feature on the new dataset. */
 	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
 	    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    !newds->ds_large_blocks) {
 		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
 		newds->ds_large_blocks = B_TRUE;
 	}
 
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
-	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * If we actually created a non-clone, we need to create the
 	 * objset in our new dataset.
 	 */
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 
 	drba->drba_cookie->drc_ds = newds;
 
 	spa_history_log_internal_ds(newds, "receive", tx, "");
 }
 
 /*
  * Start receiving a send stream into filesystem tofs, creating snapshot
  * tosnap.  drrb is the stream's BEGIN record; if the stream has the
  * opposite byte order, the fields of *drrb are byteswapped in place after
  * the BEGIN record has been folded into the running checksum.  drc is
  * zeroed and initialized here.
  *
  * Returns EINVAL if the BEGIN magic is not recognized, otherwise the
  * result of the dmu_recv_begin_check()/dmu_recv_begin_sync() sync task.
  *
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
     boolean_t force, char *origin, dmu_recv_cookie_t *drc)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
 	dmu_replay_record_t *drr;
 
 	bzero(drc, sizeof (dmu_recv_cookie_t));
 	drc->drc_drrb = drrb;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 	drc->drc_cred = CRED();
 
 	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
 		drc->drc_byteswap = B_TRUE;
 	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Checksum the BEGIN record as it appeared in the stream, i.e.
 	 * before any field of *drrb is swapped below.
 	 */
 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
 	drr->drr_u.drr_begin = *drc->drc_drrb;
 	if (drc->drc_byteswap) {
 		fletcher_4_incremental_byteswap(drr,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
 		fletcher_4_incremental_native(drr,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	}
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 
 	if (drc->drc_byteswap) {
 		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
 		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
 		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
 		drrb->drr_type = BSWAP_32(drrb->drr_type);
 		/*
 		 * drr_flags is read by the check and sync callbacks
 		 * (DRR_FLAG_CLONE, DRR_FLAG_CI_DATA), so it has to be in
 		 * native order too; backup_byteswap() swaps the same field.
 		 */
 		drrb->drr_flags = BSWAP_32(drrb->drr_flags);
 		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
 		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
 	}
 
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 
 	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
 	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * State carried through the restore_*() helpers while a stream is read.
  */
 struct restorearg {
 	int err;		/* last error recorded by restore_read() */
 	boolean_t byteswap;	/* selects byteswapping checksum/data paths */
 	kthread_t *td;		/* thread used for the read in restore_bytes() */
 	struct file *fp;	/* file the stream is read from */
 	char *buf;		/* default buffer used by restore_read() */
 	uint64_t voff;		/* current read offset in the stream */
 	int bufsize; /* amount of memory allocated for buf */
 	zio_cksum_t cksum;	/* running fletcher-4 checksum of data read */
 	avl_tree_t *guid_to_ds_map;
 };
 
 /*
  * AVL tree node associating a guid with a held dataset.  Entries are
  * ordered by guid_compare() and torn down by free_guid_map_onexit().
  */
 typedef struct guid_map_entry {
 	uint64_t	guid;		/* sort key */
 	dsl_dataset_t	*gme_ds;	/* dataset held with this entry as tag */
 	avl_node_t	avlnode;	/* AVL linkage */
 } guid_map_entry_t;
 
 /*
  * AVL comparator for guid_map_entry_t: orders entries by their guid.
  * Returns -1, 0 or 1.
  */
 static int
 guid_compare(const void *arg1, const void *arg2)
 {
 	const guid_map_entry_t *lhs = arg1;
 	const guid_map_entry_t *rhs = arg2;
 
 	if (lhs->guid == rhs->guid)
 		return (0);
 	return (lhs->guid < rhs->guid ? -1 : 1);
 }
 
 /*
  * Tear down a guid map: drop the long hold and the hold on every entry's
  * dataset (both tagged with the entry itself), free the entries, then
  * destroy and free the tree.  arg is the avl_tree_t.
  */
 static void
 free_guid_map_onexit(void *arg)
 {
 	avl_tree_t *tree = arg;
 	void *walk = NULL;
 	guid_map_entry_t *entry;
 
 	for (;;) {
 		entry = avl_destroy_nodes(tree, &walk);
 		if (entry == NULL)
 			break;
 		dsl_dataset_long_rele(entry->gme_ds, entry);
 		dsl_dataset_rele(entry->gme_ds, entry);
 		kmem_free(entry, sizeof (guid_map_entry_t));
 	}
 	avl_destroy(tree);
 	kmem_free(tree, sizeof (avl_tree_t));
 }
 
 /*
  * Read up to len bytes at offset off from ra->fp into buf.  The number of
  * bytes NOT transferred is stored in *resid.  Outside the kernel no read
  * is attempted and EOPNOTSUPP is returned.
  */
 static int
 restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
 {
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 
 	/* One iovec covering the caller's buffer. */
 	aiov.iov_len = len;
 	aiov.iov_base = buf;
 
 	auio.uio_iovcnt = 1;
 	auio.uio_iov = &aiov;
 	auio.uio_offset = off;
 	auio.uio_resid = len;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = ra->td;
 #ifdef _KERNEL
 	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
 #else
 	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
 	error = EOPNOTSUPP;
 #endif
 	*resid = auio.uio_resid;
 	return (error);
 }
 
 /*
  * Read exactly len bytes of the stream into buf (or into ra->buf when buf
  * is NULL), advancing ra->voff and folding the data into ra->cksum.
  *
  * Returns the buffer on success.  On failure returns NULL with ra->err
  * set; a read that transfers no data at all is reported as EINVAL.
  */
 static void *
 restore_read(struct restorearg *ra, int len, char *buf)
 {
 	int done = 0;
 
 	if (buf == NULL)
 		buf = ra->buf;
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
 	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
 
 		ra->err = restore_bytes(ra, buf + done,
 		    len - done, ra->voff, &resid);
 
 		/* No progress at all: treat as a truncated/invalid stream. */
 		if (resid == len - done)
 			ra->err = SET_ERROR(EINVAL);
 		/* Account for the bytes actually transferred this round. */
 		ra->voff += len - done - resid;
 		done = len - resid;
 		if (ra->err != 0)
 			return (NULL);
 	}
 
 	ASSERT3U(done, ==, len);
 	if (ra->byteswap)
 		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
 	else
 		fletcher_4_incremental_native(buf, len, &ra->cksum);
 	return (buf);
 }
 
 /*
  * Byteswap a replay record header in place.  drr_type and drr_payloadlen
  * are swapped first; the now-native drr_type then selects which fields of
  * the drr_u union are swapped.  Record types not listed are left alone.
  */
 static void
 backup_byteswap(dmu_replay_record_t *drr)
 {
 /* Swap one 64-bit / 32-bit member of drr->drr_u in place. */
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
 		DO64(drr_begin.drr_versioninfo);
 		DO64(drr_begin.drr_creation_time);
 		DO32(drr_begin.drr_type);
 		DO32(drr_begin.drr_flags);
 		DO64(drr_begin.drr_toguid);
 		DO64(drr_begin.drr_fromguid);
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
 		DO32(drr_object.drr_bonuslen);
 		DO64(drr_object.drr_toguid);
 		break;
 	case DRR_FREEOBJECTS:
 		DO64(drr_freeobjects.drr_firstobj);
 		DO64(drr_freeobjects.drr_numobjs);
 		DO64(drr_freeobjects.drr_toguid);
 		break;
 	case DRR_WRITE:
 		DO64(drr_write.drr_object);
 		DO32(drr_write.drr_type);
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_length);
 		DO64(drr_write.drr_toguid);
 		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
 		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
 		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
 		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
 		DO64(drr_write.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_BYREF:
 		DO64(drr_write_byref.drr_object);
 		DO64(drr_write_byref.drr_offset);
 		DO64(drr_write_byref.drr_length);
 		DO64(drr_write_byref.drr_toguid);
 		DO64(drr_write_byref.drr_refguid);
 		DO64(drr_write_byref.drr_refobject);
 		DO64(drr_write_byref.drr_refoffset);
 		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
 		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
 		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
 		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
 		DO64(drr_write_byref.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		DO64(drr_write_embedded.drr_object);
 		DO64(drr_write_embedded.drr_offset);
 		DO64(drr_write_embedded.drr_length);
 		DO64(drr_write_embedded.drr_toguid);
 		DO32(drr_write_embedded.drr_lsize);
 		DO32(drr_write_embedded.drr_psize);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
 	case DRR_SPILL:
 		DO64(drr_spill.drr_object);
 		DO64(drr_spill.drr_length);
 		DO64(drr_spill.drr_toguid);
 		break;
 	case DRR_END:
 		DO64(drr_end.drr_checksum.zc_word[0]);
 		DO64(drr_end.drr_checksum.zc_word[1]);
 		DO64(drr_end.drr_checksum.zc_word[2]);
 		DO64(drr_end.drr_checksum.zc_word[3]);
 		DO64(drr_end.drr_toguid);
 		break;
 	}
 #undef DO64
 #undef DO32
 }
 
 /*
  * Work out how many block pointers a dnode has from its bonus buffer: one
  * for a DMU_OT_SA bonus, otherwise one plus however many blkptr-sized
  * slots are left unused out of DN_MAX_BONUSLEN.
  */
 static inline uint8_t
 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 {
 	uint64_t spare_slots;
 
 	if (bonus_type == DMU_OT_SA)
 		return (1);
 
 	spare_slots = (DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT;
 	return (1 + spare_slots);
 }
 
 /*
  * Apply a DRR_OBJECT record: validate it, read the bonus payload (if any)
  * from the stream, then claim the object if it is currently free or
  * reclaim it if it exists with different properties, and finally set its
  * checksum, compression and bonus contents.
  *
  * Returns 0 on success, EINVAL for an invalid record or a failed
  * claim/reclaim, or the error from reading the stream or assigning the tx.
  */
 static int
 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	void *data = NULL;
 	uint64_t object;
 	int err;
 
 	/* Reject records whose fields are out of range. */
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
 	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
 	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	err = dmu_object_info(os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
 		return (SET_ERROR(EINVAL));
 	/* err == 0: object exists; ENOENT: it must be created. */
 	object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
 	if (drro->drr_bonuslen) {
 		/* The bonus payload is padded to 8 bytes in the stream. */
 		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL);
 		if (ra->err != 0)
 			return (ra->err);
 	}
 
 	/*
 	 * If we are losing blkptrs or changing the block size this must
 	 * be a new file instance.  We must clear out the previous file
 	 * contents before we can change this type of metadata in the dnode.
 	 */
 	if (err == 0) {
 		int nblkptr;
 
 		nblkptr = deduce_nblkptr(drro->drr_bonustype,
 		    drro->drr_bonuslen);
 
 		if (drro->drr_blksz != doi.doi_data_block_size ||
 		    nblkptr < doi.doi_nblkptr) {
 			err = dmu_free_long_range(os, drro->drr_object,
 			    0, DMU_OBJECT_END);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	if (object == DMU_NEW_OBJECT) {
 		/* currently free, want to be allocated */
 		err = dmu_object_claim(os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	} else if (drro->drr_type != doi.doi_type ||
 	    drro->drr_blksz != doi.doi_data_block_size ||
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* currently allocated, but with different properties */
 		err = dmu_object_reclaim(os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	}
 	/* The tx is already assigned, so it is committed, not aborted. */
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
 	    tx);
 	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
 
 	if (data != NULL) {
 		dmu_buf_t *db;
 
 		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
 		if (ra->byteswap) {
 			/* Swap the bonus data according to its type. */
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    drro->drr_bonuslen);
 		}
 		dmu_buf_rele(db, FTAG);
 	}
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Apply a DRR_FREEOBJECTS record: free every object that exists in
  * [drr_firstobj, drr_firstobj + drr_numobjs).  Returns EINVAL if the range
  * wraps around, or the first error from dmu_free_long_object().
  */
 /* ARGSUSED */
 static int
 restore_freeobjects(struct restorearg *ra, objset_t *os,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t first = drrfo->drr_firstobj;
 	uint64_t limit = first + drrfo->drr_numobjs;
 	uint64_t obj;
 
 	/* A limit below the start means the range overflowed. */
 	if (limit < first)
 		return (SET_ERROR(EINVAL));
 
 	obj = first;
 	while (obj < limit) {
 		/* Only objects that exist need to be freed. */
 		if (dmu_object_info(os, obj, NULL) == 0) {
 			int err = dmu_free_long_object(os, obj);
 
 			if (err != 0)
 				return (err);
 		}
 		(void) dmu_object_next(os, &obj, FALSE, 0);
 	}
 	return (0);
 }
 
 /*
  * Apply a DRR_WRITE record: read drr_length bytes of payload from the
  * stream directly into a buffer obtained from dmu_request_arcbuf() and
  * assign that buffer to the object at drr_offset.
  *
  * Returns 0 on success, EINVAL for an invalid record or unknown object,
  * or the error from reading the stream or assigning the tx.
  */
 static int
 restore_write(struct restorearg *ra, objset_t *os,
     struct drr_write *drrw)
 {
 	dmu_tx_t *tx;
 	void *data;
 	int err;
 
 	/* Reject a wrapping offset+length range or an invalid type. */
 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	dmu_buf_t *bonus;
 	if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0)
 		return (SET_ERROR(EINVAL));
 
 	arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length);
 
 	data = restore_read(ra, drrw->drr_length, abuf->b_data);
 	if (data == NULL) {
 		dmu_return_arcbuf(abuf);
 		dmu_buf_rele(bonus, FTAG);
 		return (ra->err);
 	}
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_return_arcbuf(abuf);
 		dmu_buf_rele(bonus, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 	if (ra->byteswap) {
 		/* Swap the payload according to the record's object type. */
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(drrw->drr_type);
 		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
 	}
 	/* abuf is not returned on this path, unlike the error paths above. */
 	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
 	dmu_tx_commit(tx);
 	dmu_buf_rele(bonus, FTAG);
 	return (0);
 }
 
 /*
  * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
  * streams to refer to a copy of the data that is already on the
  * system because it came in earlier in the stream.  This function
  * finds the earlier copy of the data, and uses that copy instead of
  * data from the stream to fulfill this write.
  *
  * Returns EINVAL if the target range wraps or the referenced dataset
  * cannot be found in ra->guid_to_ds_map; otherwise the error from
  * dmu_buf_hold() or dmu_tx_assign(), or 0 on success.
  */
 static int
 restore_write_byref(struct restorearg *ra, objset_t *os,
     struct drr_write_byref *drrwbr)
 {
 	dmu_tx_t *tx;
 	int err;
 	guid_map_entry_t gmesrch;
 	guid_map_entry_t *gmep;
 	avl_index_t where;
 	objset_t *ref_os = NULL;
 	dmu_buf_t *dbp;
 
 	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If the GUID of the referenced dataset is different from the
 	 * GUID of the target dataset, find the referenced dataset.
 	 */
 	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
 		gmesrch.guid = drrwbr->drr_refguid;
 		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
 		    &where)) == NULL) {
 			return (SET_ERROR(EINVAL));
 		}
 		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
 			return (SET_ERROR(EINVAL));
 	} else {
 		ref_os = os;
 	}
 
 	/* Hold the buffer that contains the earlier copy of the data. */
 	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
 	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		/* Drop the hold taken above so it is not leaked on failure. */
 		dmu_buf_rele(dbp, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 	dmu_write(os, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
 	dmu_buf_rele(dbp, FTAG);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Handle a write-embedded record: validate the record, read the
  * payload (drr_psize rounded up to a multiple of 8 bytes) from the
  * stream, and hand it to dmu_write_embedded() inside a transaction.
  *
  * Returns EINVAL if the range wraps, drr_psize exceeds
  * BPE_PAYLOAD_SIZE, or the embedded type or compression is out of
  * range; ra->err if the payload cannot be read; or the error from
  * dmu_tx_assign().
  */
 static int
 restore_write_embedded(struct restorearg *ra, objset_t *os,
     struct drr_write_embedded *drrwnp)
 {
 	dmu_tx_t *tx;
 	int err;
 	void *data;
 
 	/*
 	 * Validation failures go through SET_ERROR() like every other
 	 * restore_*() handler in this file.
 	 */
 	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
 		return (SET_ERROR(EINVAL));
 	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 		return (SET_ERROR(EINVAL));
 
 	data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL);
 	if (data == NULL)
 		return (ra->err);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, drrwnp->drr_object,
 	    drrwnp->drr_offset, drrwnp->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	dmu_write_embedded(os, drrwnp->drr_object,
 	    drrwnp->drr_offset, data, drrwnp->drr_etype,
 	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
 	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Handle a spill record: read drr_length bytes from the stream and copy
  * them into the object's spill buffer, growing the spill block first if
  * it is smaller than drr_length.  Returns EINVAL for an out-of-range
  * length or an object that cannot be looked up; ra->err if the payload
  * cannot be read; or the error from dmu_spill_hold_by_bonus() or
  * dmu_tx_assign().
  */
 static int
 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
 {
 	dmu_buf_t *bonus, *spill;
 	dmu_tx_t *tx;
 	void *payload;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
 	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
 		return (SET_ERROR(EINVAL));
 
 	/* The payload is read before the object is looked up. */
 	payload = restore_read(ra, drrs->drr_length, NULL);
 	if (payload == NULL)
 		return (ra->err);
 
 	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &bonus));
 	err = dmu_spill_hold_by_bonus(bonus, FTAG, &spill);
 	if (err != 0) {
 		dmu_buf_rele(bonus, FTAG);
 		return (err);
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_spill(tx, bonus->db_object);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_buf_rele(bonus, FTAG);
 		dmu_buf_rele(spill, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 	dmu_buf_will_dirty(spill, tx);
 
 	/* Grow the spill block when it cannot hold the payload. */
 	if (spill->db_size < drrs->drr_length) {
 		VERIFY(0 == dbuf_spill_set_blksz(spill,
 		    drrs->drr_length, tx));
 	}
 	bcopy(payload, spill->db_data, drrs->drr_length);
 
 	dmu_buf_rele(bonus, FTAG);
 	dmu_buf_rele(spill, FTAG);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Handle a free record: free the byte range [drr_offset, drr_offset +
  * drr_length) of drr_object.  A drr_length of -1ULL is exempt from the
  * wrap check.  Returns EINVAL for a wrapping range or an object that
  * cannot be looked up, otherwise the result of dmu_free_long_range().
  */
 /* ARGSUSED */
 static int
 restore_free(struct restorearg *ra, objset_t *os,
     struct drr_free *drrf)
 {
 	if (drrf->drr_length != -1ULL &&
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (dmu_object_info(os, drrf->drr_object, NULL) != 0) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	return (dmu_free_long_range(os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length));
 }
 
 /*
  * Error-path helper: disown drc->drc_ds and destroy it by name.  The
  * result of dsl_destroy_head() is deliberately ignored.
  */
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
 	char dsname[MAXNAMELEN];
 
 	/* Fetch the name before disowning; the destroy call needs it. */
 	dsl_dataset_name(drc->drc_ds, dsname);
 	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
 
 	(void) dsl_destroy_head(dsname);
 }
 
 /*
  * Read replay records from fp and apply them to the dataset held in
  * drc->drc_ds, dispatching each record type to its restore_*() handler.
  *
  * *voffp supplies the starting file offset and is updated with the
  * final offset on return.  cleanup_fd and *action_handlep are only
  * used for dedup'ed streams, where they locate (or register) the
  * guid-to-dataset AVL map.
  *
  * Returns 0 when a DRR_END record with a matching checksum is reached,
  * otherwise an error; on error the dataset is destroyed via
  * dmu_recv_cleanup_ds().
  *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep)
 {
 	struct restorearg ra = { 0 };
 	dmu_replay_record_t *drr;
 	objset_t *os;
 	zio_cksum_t pcksum;
 	int featureflags;
 
 	ra.byteswap = drc->drc_byteswap;
 	ra.cksum = drc->drc_cksum;
 	ra.td = curthread;
 	ra.fp = fp;
 	ra.voff = *voffp;
 	ra.bufsize = SPA_MAXBLOCKSIZE;
 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
 	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	/*
 	 * Open the objset we are modifying.
 	 */
 	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
 
-	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
 	/* featureflags is read again at "out:", so set it before any goto. */
 	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
 	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
 		minor_t minor;
 
 		if (cleanup_fd == -1) {
 			ra.err = SET_ERROR(EBADF);
 			goto out;
 		}
 		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
 		if (ra.err != 0) {
 			/* No hold was taken, so skip the release at "out:". */
 			cleanup_fd = -1;
 			goto out;
 		}
 
 		if (*action_handlep == 0) {
 			/* First stream on this fd: create and register the map. */
 			ra.guid_to_ds_map =
 			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 			avl_create(ra.guid_to_ds_map, guid_compare,
 			    sizeof (guid_map_entry_t),
 			    offsetof(guid_map_entry_t, avlnode));
 			/*
 			 * NOTE(review): if registration fails, the map
 			 * allocated above is not freed here -- confirm
 			 * whether that is a leak.
 			 */
 			ra.err = zfs_onexit_add_cb(minor,
 			    free_guid_map_onexit, ra.guid_to_ds_map,
 			    action_handlep);
 			if (ra.err != 0)
 				goto out;
 		} else {
 			/* Reuse the map registered under *action_handlep. */
 			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
 			    (void **)&ra.guid_to_ds_map);
 			if (ra.err != 0)
 				goto out;
 		}
 
 		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
 	}
 
 	/*
 	 * Read records and process them.
 	 */
 	pcksum = ra.cksum;
 	while (ra.err == 0 &&
 	    NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) {
 		/* Stop with EINTR when a signal is pending. */
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			ra.err = SET_ERROR(EINTR);
 			goto out;
 		}
 
 		if (ra.byteswap)
 			backup_byteswap(drr);
 
 		switch (drr->drr_type) {
 		case DRR_OBJECT:
 		{
 			/*
 			 * We need to make a copy of the record header,
 			 * because restore_{object,write} may need to
 			 * restore_read(), which will invalidate drr.
 			 */
 			struct drr_object drro = drr->drr_u.drr_object;
 			ra.err = restore_object(&ra, os, &drro);
 			break;
 		}
 		case DRR_FREEOBJECTS:
 		{
 			struct drr_freeobjects drrfo =
 			    drr->drr_u.drr_freeobjects;
 			ra.err = restore_freeobjects(&ra, os, &drrfo);
 			break;
 		}
 		case DRR_WRITE:
 		{
 			struct drr_write drrw = drr->drr_u.drr_write;
 			ra.err = restore_write(&ra, os, &drrw);
 			break;
 		}
 		case DRR_WRITE_BYREF:
 		{
 			struct drr_write_byref drrwbr =
 			    drr->drr_u.drr_write_byref;
 			ra.err = restore_write_byref(&ra, os, &drrwbr);
 			break;
 		}
 		case DRR_WRITE_EMBEDDED:
 		{
 			struct drr_write_embedded drrwe =
 			    drr->drr_u.drr_write_embedded;
 			ra.err = restore_write_embedded(&ra, os, &drrwe);
 			break;
 		}
 		case DRR_FREE:
 		{
 			struct drr_free drrf = drr->drr_u.drr_free;
 			ra.err = restore_free(&ra, os, &drrf);
 			break;
 		}
 		case DRR_END:
 		{
 			struct drr_end drre = drr->drr_u.drr_end;
 			/*
 			 * We compare against the *previous* checksum
 			 * value, because the stored checksum is of
 			 * everything before the DRR_END record.
 			 */
 			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
 				ra.err = SET_ERROR(ECKSUM);
 			goto out;
 		}
 		case DRR_SPILL:
 		{
 			struct drr_spill drrs = drr->drr_u.drr_spill;
 			ra.err = restore_spill(&ra, os, &drrs);
 			break;
 		}
 		default:
 			ra.err = SET_ERROR(EINVAL);
 			goto out;
 		}
 		/* Remember the checksum as of the end of this record. */
 		pcksum = ra.cksum;
 	}
 	/*
 	 * Leaving the loop without a goto means no DRR_END was seen, so
 	 * an error is expected to be recorded in ra.err.
 	 */
 	ASSERT(ra.err != 0);
 
 out:
 	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
 		zfs_onexit_fd_rele(cleanup_fd);
 
 	if (ra.err != 0) {
 		/*
 		 * destroy what we created, so we don't leave it in the
 		 * inconsistent restoring state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 	}
 
 	kmem_free(ra.buf, ra.bufsize);
 	*voffp = ra.voff;
 	return (ra.err);
 }
 
 /*
  * Check half of the receive-end sync task (paired with
  * dmu_recv_end_sync() in the dsl_sync_task() calls in this file).
  *
  * For a receive into an existing filesystem (!drc_newfs) this holds
  * the target head (drc_tofs), optionally checks that the snapshots a
  * forced receive would destroy can be destroyed, and then checks the
  * clone swap, the new snapshot and the destruction of drc_ds.  For a
  * new filesystem only the snapshot check is performed.
  *
  * Returns 0 if the sync function may proceed, otherwise an error.  The
  * hold on the target head is released on every return path.
  */
 static int
 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int error;
 
 	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
 
 	if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 		if (error != 0)
 			return (error);
 		if (drc->drc_force) {
 			/*
 			 * We will destroy any snapshots in tofs (i.e. before
 			 * origin_head) that are after the origin (which is
 			 * the snap before drc_ds, because drc_ds can not
 			 * have any snaps of its own).
 			 */
-			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
-			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				error = dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap);
 				if (error != 0) {
 					/* Don't leak the hold on the head. */
 					dsl_dataset_rele(origin_head, FTAG);
 					return (error);
 				}
 				if (snap->ds_dir != origin_head->ds_dir)
 					error = SET_ERROR(EINVAL);
 				if (error == 0)  {
 					error = dsl_destroy_snapshot_check_impl(
 					    snap, B_FALSE);
 				}
-				obj = snap->ds_phys->ds_prev_snap_obj;
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_dataset_rele(snap, FTAG);
 				if (error != 0) {
 					/* Don't leak the hold on the head. */
 					dsl_dataset_rele(origin_head, FTAG);
 					return (error);
 				}
 			}
 		}
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
 		    origin_head, drc->drc_force, drc->drc_owner, tx);
 		if (error != 0) {
 			dsl_dataset_rele(origin_head, FTAG);
 			return (error);
 		}
 		error = dsl_dataset_snapshot_check_impl(origin_head,
 		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
 		dsl_dataset_rele(origin_head, FTAG);
 		if (error != 0)
 			return (error);
 
 		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
 	} else {
 		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
 		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
 	}
 	return (error);
 }
 
 /*
  * Sync half of the receive-end sync task (paired with
  * dmu_recv_end_check() in the dsl_sync_task() calls in this file).
  *
  * Existing filesystem (!drc_newfs): optionally destroys the snapshots
  * of the target head that come after the origin, swaps drc_ds with the
  * target head, snapshots the head as drc_tosnap, and destroys drc_ds.
  * New filesystem: snapshots drc_ds itself.
  *
  * In both cases the new snapshot gets the creation time and guid from
  * the stream's begin record, DS_FLAG_INCONSISTENT is cleared on the
  * snapshot and on the head, drc_newsnapobj is recorded, and drc_ds is
  * disowned and set to NULL.
  */
 static void
 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
 	    tx, "snap=%s", drc->drc_tosnap);
 
 	if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
 
 		if (drc->drc_force) {
 			/*
 			 * Destroy any snapshots of drc_tofs (origin_head)
 			 * after the origin (the snap before drc_ds).
 			 */
-			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
-			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap));
 				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
 				/* Read the next object before destroying snap. */
-				obj = snap->ds_phys->ds_prev_snap_obj;
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_destroy_snapshot_sync_impl(snap,
 				    B_FALSE, tx);
 				dsl_dataset_rele(snap, FTAG);
 			}
 		}
 		VERIFY3P(drc->drc_ds->ds_prev, ==,
 		    origin_head->ds_prev);
 
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		dsl_dataset_snapshot_sync_impl(origin_head,
 		    drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
-		origin_head->ds_prev->ds_phys->ds_creation_time =
+		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
-		origin_head->ds_prev->ds_phys->ds_guid =
+		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
-		origin_head->ds_prev->ds_phys->ds_flags &=
+		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
-		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(origin_head)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
 
 		/*
 		 * NOTE(review): origin_head is dereferenced here after the
 		 * FTAG hold was released above; presumably the owner's hold
 		 * keeps it valid -- confirm.
 		 */
 		if (drc->drc_owner != NULL)
 			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
 		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-		ds->ds_prev->ds_phys->ds_creation_time =
+		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
-		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
-		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(ds->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 	}
 	/* Record the object number of the snapshot that was just created. */
-	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
+	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode,
 	 * we can evict its bonus buffer.
 	 */
 	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
 	drc->drc_ds = NULL;
 }
 
 /*
  * Add the snapshot with object number snapobj, in the pool that "name"
  * belongs to, to the guid-to-dataset AVL map guid_map.
  *
  * The new map entry is used as the tag for both the dataset hold and
  * the long hold, so those holds stay in place after this function
  * returns.  Returns 0 on success, or the error from dsl_pool_hold() or
  * dsl_dataset_hold_obj(); on failure nothing is added to the map.
  */
 static int
 add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *snapds;
 	guid_map_entry_t *gmep;
 	int err;
 
 	ASSERT(guid_map != NULL);
 
 	err = dsl_pool_hold(name, FTAG, &dp);
 	if (err != 0)
 		return (err);
 	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
 	/* The entry itself is the hold tag. */
 	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
 	if (err == 0) {
-		gmep->guid = snapds->ds_phys->ds_guid;
+		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
 		gmep->gme_ds = snapds;
 		avl_add(guid_map, gmep);
 		dsl_dataset_long_hold(snapds, gmep);
 	} else
 		/* The hold failed, so the unused entry is freed. */
 		kmem_free(gmep, sizeof (*gmep));
 
 	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
 /*
  * Passed as the fifth argument of dsl_sync_task() by both
  * dmu_recv_existing_end() and dmu_recv_new_end().  Presumably an
  * estimate of the number of blocks the sync task modifies -- confirm
  * against dsl_sync_task().
  */
 static int dmu_recv_end_modified_blocks = 3;
 
 /*
  * Finish a receive into an existing filesystem by running the
  * receive-end sync task on drc_tofs.  If the task fails, the temporary
  * dataset is destroyed via dmu_recv_cleanup_ds().  Returns the sync
  * task's result.
  */
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 	char name[MAXNAMELEN];
 	int error;
 
 #ifdef _KERNEL
 	/*
 	 * drc_ds is about to be destroyed, so its origin must be
 	 * unmounted first if necessary.
 	 */
 	dsl_dataset_name(drc->drc_ds, name);
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 	if (error == 0)
 		return (error);
 
 	dmu_recv_cleanup_ds(drc);
 	return (error);
 }
 
 /*
  * Finish a receive into a new filesystem by running the receive-end
  * sync task on drc_tofs.  On failure the dataset is destroyed via
  * dmu_recv_cleanup_ds(); on success the new snapshot is added to the
  * guid map when one exists (errors from that step are ignored).
  * Returns the sync task's result.
  */
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
 	int error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 		return (error);
 	}
 
 	if (drc->drc_guid_to_ds_map != NULL) {
 		(void) add_ds_to_guidmap(drc->drc_tofs,
 		    drc->drc_guid_to_ds_map,
 		    drc->drc_newsnapobj);
 	}
 	return (error);
 }
 
 /*
  * Finish a receive: record the owner in the cookie, then dispatch to
  * the new-filesystem or existing-filesystem variant according to
  * drc->drc_newfs.  Returns that variant's result.
  */
 int
 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
 	drc->drc_owner = owner;
 
 	return (drc->drc_newfs ?
 	    dmu_recv_new_end(drc) : dmu_recv_existing_end(drc));
 }
 
 /*
  * Report whether a receive is in progress on this objset, i.e. whether
  * it has a dataset and that dataset is owned by dmu_recv_tag.
  */
 boolean_t
 dmu_objset_is_receiving(objset_t *os)
 {
 	if (os->os_dsl_dataset == NULL)
 		return (B_FALSE);
 
 	return (os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	(revision 275782)
@@ -1,648 +1,648 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
 #include <sys/zfeature.h>
 
 /*
  * Initial value for prefetch_data_t.pd_blks_max: traverse_impl() copies
  * it there, and traverse_prefetcher() waits while pd_blks_fetched has
  * reached that limit.
  */
 int zfs_pd_blks_max = 100;
 
 /*
  * State shared between a traversal (traverse_visitbp()) and its
  * prefetch thread (traverse_prefetcher()).
  */
 typedef struct prefetch_data {
 	kmutex_t pd_mtx;	/* held around updates and waits below */
 	kcondvar_t pd_cv;	/* broadcast when the state below changes */
 	int pd_blks_max;	/* prefetcher waits at this many blocks */
 	int pd_blks_fetched;	/* ++ by prefetcher, -- by traversal */
 	int pd_flags;		/* TRAVERSE_* flags of the traversal */
 	boolean_t pd_cancel;	/* set to make the prefetcher stop */
 	boolean_t pd_exited;	/* set once the prefetch thread is done */
 } prefetch_data_t;
 
 /*
  * Per-traversal state, filled in by traverse_impl() and passed down
  * through traverse_visitbp() and its helpers.
  */
 typedef struct traverse_data {
 	spa_t *td_spa;			/* pool being traversed */
 	uint64_t td_objset;		/* objset number used in bookmarks */
 	blkptr_t *td_rootbp;		/* block pointer traversal starts at */
 	uint64_t td_min_txg;		/* born at or before this: skipped */
 	zbookmark_phys_t *td_resume;	/* resume bookmark, or NULL */
 	int td_flags;			/* TRAVERSE_* flags */
 	prefetch_data_t *td_pfd;	/* shared prefetch state, or NULL */
 	boolean_t td_paused;		/* td_resume has been recorded */
 	uint64_t td_hole_birth_enabled_txg; /* see traverse_visitbp() */
 	blkptr_cb_t *td_func;		/* callback invoked per block */
 	void *td_arg;			/* argument passed to td_func */
 } traverse_data_t;
 
 /*
  * Forward declarations: traverse_visitbp() calls both of these, and
  * traverse_dnode() in turn calls traverse_visitbp().
  */
 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object);
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);
 
 /*
  * zil_parse() block callback: report one ZIL block to the traversal
  * callback.  Holes are skipped, and so are blocks born at or after the
  * pool's first txg when the log is unclaimed (claim_txg == 0).  Always
  * returns 0; the traversal callback's result is ignored.
  */
 static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 	zbookmark_phys_t bookmark;
 
 	if (BP_IS_HOLE(bp) ||
 	    (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))) {
 		return (0);
 	}
 
 	SET_BOOKMARK(&bookmark, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	(void) td->td_func(td->td_spa, zilog, bp, &bookmark, NULL,
 	    td->td_arg);
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback: for TX_WRITE records, report the block
  * pointer carried in the record to the traversal callback.  Holes are
  * skipped, as are all records when the log is unclaimed and blocks born
  * before claim_txg.  Always returns 0; the traversal callback's result
  * is ignored.
  */
 static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 	lr_write_t *lr;
 	blkptr_t *bp;
 	zbookmark_phys_t bookmark;
 
 	/* Only write records carry a block pointer of interest. */
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	lr = (lr_write_t *)lrc;
 	bp = &lr->lr_blkptr;
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	if (claim_txg == 0 || bp->blk_birth < claim_txg)
 		return (0);
 
 	SET_BOOKMARK(&bookmark, td->td_objset, lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	(void) td->td_func(td->td_spa, zilog, bp, &bookmark, NULL,
 	    td->td_arg);
 	return (0);
 }
 
 /*
  * Traverse the intent log described by zh, reporting its blocks and
  * write records through traverse_zil_block() and traverse_zil_record().
  * Nothing is done for an unclaimed log on a writeable pool.
  */
 static void
 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
 	zilog_t *zilog;
 	uint64_t claimed = zh->zh_claim_txg;
 
 	/*
 	 * Only blocks that have been claimed but not yet replayed are
 	 * of interest; in read-only mode, already-stable blocks are
 	 * visited as well.
 	 */
 	if (claimed == 0 && spa_writeable(td->td_spa)) {
 		return;
 	}
 
 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 	    claimed);
 	zil_free(zilog);
 }
 
 /* Result of resume_skip_check(); traverse_visitbp() switches on it. */
 typedef enum resume_skip {
 	RESUME_SKIP_ALL,	/* do not visit the block at all */
 	RESUME_SKIP_NONE,	/* visit the block normally */
 	RESUME_SKIP_CHILDREN	/* visit the block but not its children */
 } resume_skip_t;
 
 /*
  * Decide how a resumed traversal should treat the block at zb.
  *
  * RESUME_SKIP_ALL: td says we are resuming and this block lies before
  * the resume point, so it need not be visited at all.
  * RESUME_SKIP_CHILDREN: this is a post-order traversal and zb is the
  * resume point itself; visit the block but not its children, which a
  * previous traversal must already have visited.
  * RESUME_SKIP_NONE: everything else, including traversals that are not
  * resuming.
  */
 static resume_skip_t
 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	/* Not resuming, or the resume point has already been reached. */
 	if (td->td_resume == NULL || ZB_IS_ZERO(td->td_resume))
 		return (RESUME_SKIP_NONE);
 
 	/* This bp and everything below it has been visited already. */
 	if (zbookmark_is_before(dnp, zb, td->td_resume))
 		return (RESUME_SKIP_ALL);
 
 	/*
 	 * On reaching the block we are resuming from, zero the bookmark
 	 * to record that the traversal has resumed.
 	 */
 	if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 		bzero(td->td_resume, sizeof (*zb));
 		if (td->td_flags & TRAVERSE_POST)
 			return (RESUME_SKIP_CHILDREN);
 	}
 
 	return (RESUME_SKIP_NONE);
 }
 
 /*
  * Issue a no-wait ARC prefetch read for the metadata block bp, if the
  * traversal asked for metadata prefetch.  Holes, blocks born at or
  * before td_min_txg, and level-0 blocks that are not dnode blocks are
  * not prefetched.
  */
 static void
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	uint32_t aflags;
 
 	if ((td->td_flags & TRAVERSE_PREFETCH_METADATA) == 0)
 		return;
 
 	/*
 	 * While a resume is still in progress, skip prefetching: some
 	 * children will not be needed (and may in fact already have
 	 * been freed).
 	 */
 	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
 		return;
 
 	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
 		return;
 
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return;
 
 	aflags = ARC_NOWAIT | ARC_PREFETCH;
 	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 }
 
 /*
  * Return B_TRUE if the data prefetcher handles bp: that is, bp is not a
  * hole, not an embedded block pointer, and not an intent-log block.
  */
 static boolean_t
 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 {
 	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 
 	if (BP_IS_HOLE(bp))
 		return (B_FALSE);
 	if (BP_IS_EMBEDDED(bp))
 		return (B_FALSE);
 	if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Visit the block pointed to by bp and, recursively, everything below
  * it: indirect blocks, the dnodes in a dnode block, and the dnodes
  * embedded in an objset block.
  *
  * td   - traversal state (callback, flags, resume bookmark, prefetch)
  * dnp  - dnode that bp belongs to; NULL for the root call
  * bp   - block pointer to visit
  * zb   - bookmark identifying bp
  *
  * The callback runs before the children with TRAVERSE_PRE and after
  * them with TRAVERSE_POST.  Returns 0 or the first error; EIO and
  * ECKSUM are turned into 0 when TRAVERSE_HARD is set.  When stopping
  * with an error and a resume bookmark was supplied, the bookmark is
  * set so that a later traversal can resume from here.
  */
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t czb;
 	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
 		return (0);
 	case RESUME_SKIP_CHILDREN:
 		goto post;
 	case RESUME_SKIP_NONE:
 		break;
 	default:
 		ASSERT(0);
 	}
 
 	if (bp->blk_birth == 0) {
 		/*
 		 * Since this block has a birth time of 0 it must be a
 		 * hole created before the SPA_FEATURE_HOLE_BIRTH
 		 * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
 		 * was enabled before the min_txg for this traversal we
 		 * know the hole must have been created before the
 		 * min_txg for this traversal, so we can skip it. If
 		 * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
 		 * for this traversal we cannot tell if the hole was
 		 * created before or after the min_txg for this
 		 * traversal, so we cannot skip it.
 		 */
 		if (td->td_hole_birth_enabled_txg < td->td_min_txg)
 			return (0);
 	} else if (bp->blk_birth <= td->td_min_txg) {
 		return (0);
 	}
 
 	/*
 	 * Consume one block from the prefetcher, waiting until it has
 	 * fetched at least one or has exited.
 	 */
 	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
 		mutex_enter(&pd->pd_mtx);
 		ASSERT(pd->pd_blks_fetched >= 0);
 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
 		pd->pd_blks_fetched--;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
 	if (BP_IS_HOLE(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 		if (err != 0)
 			goto post;
 		return (0);
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		uint32_t flags = ARC_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 		cbp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			traverse_prefetch_metadata(td, &cbp[i], &czb);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		const dnode_phys_t *cdnp;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 		/*
 		 * Keep the child dnodes in their own pointer.  The dnp
 		 * parameter is still needed at "post:" below, after buf
 		 * has been released, so it must not be redirected into
 		 * buf's data.
 		 */
 		cdnp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
 			prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
 			    zb->zb_blkid * epb + i);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i++) {
 			err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
 			    zb->zb_blkid * epb + i);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
 		const dnode_phys_t *odnp;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		osp = buf->b_data;
 		odnp = &osp->os_meta_dnode;
 		prefetch_dnode_metadata(td, odnp, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		/*
 		 * The used-space dnodes are only looked at when the
 		 * buffer is at least sizeof (objset_phys_t) bytes.
 		 */
 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
 			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 
 		err = traverse_dnode(td, odnp, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			odnp = &osp->os_groupused_dnode;
 			err = traverse_dnode(td, odnp, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
 		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			odnp = &osp->os_userused_dnode;
 			err = traverse_dnode(td, odnp, zb->zb_objset,
 			    DMU_USERUSED_OBJECT);
 		}
 	}
 
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 
 post:
 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
 	if (hard && (err == EIO || err == ECKSUM)) {
 		/*
 		 * Ignore this disk error as requested by the HARD flag,
 		 * and continue traversal.
 		 */
 		err = 0;
 	}
 
 	/*
 	 * If we are stopping here, set td_resume.
 	 */
 	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
 		td->td_resume->zb_objset = zb->zb_objset;
 		td->td_resume->zb_object = zb->zb_object;
 		td->td_resume->zb_level = 0;
 		/*
 		 * If we have stopped on an indirect block (e.g. due to
 		 * i/o error), we have not visited anything below it.
 		 * Set the bookmark to the first level-0 block that we need
 		 * to visit.  This way, the resuming code does not need to
 		 * deal with resuming from indirect blocks.
 		 *
 		 * When zb_level <= 0 no scaling is needed, and dnp may be
 		 * NULL (the root call passes NULL), so it is only
 		 * dereferenced for indirect levels.
 		 */
 		td->td_resume->zb_blkid = zb->zb_blkid;
 		if (zb->zb_level > 0) {
 			td->td_resume->zb_blkid <<= zb->zb_level *
 			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
 		}
 		td->td_paused = B_TRUE;
 	}
 
 	return (err);
 }
 
 /*
  * Request metadata prefetch for every block pointer of dnp, and for
  * its spill block pointer when the dnode has one.
  */
 static void
 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t bookmark;
 	int idx;
 
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		SET_BOOKMARK(&bookmark, objset, object,
 		    dnp->dn_nlevels - 1, idx);
 		traverse_prefetch_metadata(td, &dnp->dn_blkptr[idx],
 		    &bookmark);
 	}
 
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return;
 
 	SET_BOOKMARK(&bookmark, objset, object, 0, DMU_SPILL_BLKID);
 	traverse_prefetch_metadata(td, &dnp->dn_spill, &bookmark);
 }
 
 /*
  * Visit every block pointer of dnp and then, if all of those
  * succeeded, its spill block pointer when the dnode has one.  Returns
  * 0 or the first error from traverse_visitbp().
  */
 static int
 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t bookmark;
 	int idx;
 	int err = 0;
 
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		SET_BOOKMARK(&bookmark, objset, object,
 		    dnp->dn_nlevels - 1, idx);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[idx],
 		    &bookmark);
 		if (err != 0)
 			return (err);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&bookmark, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &bookmark);
 	}
 	return (err);
 }
 
 /*
  * Traversal callback used by the prefetch thread (installed as td_func
  * by traverse_prefetch_thread()).  For each block that
  * prefetch_needed() accepts, it waits until fewer than pd_blks_max
  * blocks are outstanding, counts the block, and issues a no-wait
  * speculative ARC read for it.
  *
  * Returns EINTR once the prefetch has been cancelled, otherwise 0.
  */
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	prefetch_data_t *pfd = arg;
 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 
 	ASSERT(pfd->pd_blks_fetched >= 0);
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
 	if (!prefetch_needed(pfd, bp))
 		return (0);
 
 	/* Wait while the limit is reached, unless cancelled meanwhile. */
 	mutex_enter(&pfd->pd_mtx);
 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
 	pfd->pd_blks_fetched++;
 	/* Wake the traversal, which may be waiting for a fetched block. */
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
 	/* The read's result is ignored; no done callback is supplied. */
 	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 
 	return (0);
 }
 
 static void
 traverse_prefetch_thread(void *arg)
 {
 	traverse_data_t *td_main = arg;
 	traverse_data_t td = *td_main;
 	zbookmark_phys_t czb;
 
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 
 	mutex_enter(&td_main->td_pfd->pd_mtx);
 	td_main->td_pfd->pd_exited = B_TRUE;
 	cv_broadcast(&td_main->td_pfd->pd_cv);
 	mutex_exit(&td_main->td_pfd->pd_mtx);
 }
 
 /*
  * Common traversal driver behind traverse_dataset(),
  * traverse_dataset_destroyed() and traverse_pool().
  *
  * Fills in the traversal state (func/arg are stored in it and handed to
  * traverse_visitbp()), walks the ZIL first when ds is a non-snapshot
  * dataset whose root block pointer is not a hole, dispatches
  * traverse_prefetch_thread() when TRAVERSE_PREFETCH_DATA is set, and then
  * visits the tree rooted at rootbp.  Before returning it cancels the
  * prefetcher and waits for it to report pd_exited.
  *
  * Returns 0 on success, or the error from arc_read() of the root block
  * or from traverse_visitbp().
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 static int
 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	traverse_data_t td;
 	prefetch_data_t pd = { 0 };
 	zbookmark_phys_t czb;
 	int err;
 
 	ASSERT(ds == NULL || objset == ds->ds_object);
 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 
 	/*
 	 * The data prefetching mechanism (the prefetch thread) is incompatible
 	 * with resuming from a bookmark.
 	 */
 	ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
 
 	td.td_spa = spa;
 	td.td_objset = objset;
 	td.td_rootbp = rootbp;
 	td.td_min_txg = txg_start;
 	td.td_resume = resume;
 	td.td_func = func;
 	td.td_arg = arg;
 	td.td_pfd = &pd;
 	td.td_flags = flags;
 	td.td_paused = B_FALSE;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 		VERIFY(spa_feature_enabled_txg(spa,
 		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
 	} else {
 		td.td_hole_birth_enabled_txg = 0;
 	}
 
 	pd.pd_blks_max = zfs_pd_blks_max;
 	pd.pd_flags = flags;
 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
 	if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
 		/* Named aflags so it does not shadow the 'flags' parameter. */
 		uint32_t aflags = ARC_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, td.td_spa, rootbp,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, NULL);
 		if (err != 0) {
 			/*
 			 * The prefetch thread has not been dispatched yet, so
 			 * there is nothing to cancel or wait for; just tear
 			 * down the mutex and condvar initialised above rather
 			 * than leaking them.
 			 */
 			goto out;
 		}
 
 		osp = buf->b_data;
 		traverse_zil(&td, &osp->os_zil_header);
 		(void) arc_buf_remove_ref(buf, &buf);
 	}
 
 	/*
 	 * If prefetching was not requested, or the dispatch failed, mark the
 	 * prefetcher as already exited so the wait below does not block.
 	 */
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
 	    &td, TQ_NOQUEUE))
 		pd.pd_exited = B_TRUE;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	err = traverse_visitbp(&td, NULL, rootbp, &czb);
 
 	/* Stop the prefetcher and wait until it no longer uses td/pd. */
 	mutex_enter(&pd.pd_mtx);
 	pd.pd_cancel = B_TRUE;
 	cv_broadcast(&pd.pd_cv);
 	while (!pd.pd_exited)
 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
 	mutex_exit(&pd.pd_mtx);
 
 out:
 	mutex_destroy(&pd.pd_mtx);
 	cv_destroy(&pd.pd_cv);
 
 	return (err);
 }
 
 /*
  * Traverse the blocks of dataset ds, starting from the root block pointer
  * stored in its phys structure (ds_bp).  The spa is taken from the
  * dataset's pool, the objset id is ds->ds_object, and no resume bookmark
  * is passed.  Returns whatever traverse_impl() returns.
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
-	    &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+	    &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
 }
 
 /*
  * Traverse the tree rooted at blkptr without a dataset: traverse_impl() is
  * called with a NULL ds and ZB_DESTROYED_OBJSET as the objset id.  Unlike
  * traverse_dataset(), a resume bookmark may be supplied.
  */
 int
 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 
 	err = traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, blkptr,
 	    txg_start, resume, flags, func, arg);
 	return (err);
 }
 
 /*
  * Traverse the whole pool: first the MOS (objset 0, rooted at the pool's
  * root block pointer), then every MOS object whose bonus type is
  * DMU_OT_DSL_DATASET, via traverse_dataset().
  *
  * With TRAVERSE_HARD set, a failure of dmu_object_info() or
  * dsl_dataset_hold_obj() skips that object instead of ending the walk.
  * Returns 0 on success (ESRCH from the iteration is mapped to 0),
  * otherwise the first error that stopped the walk.
  *
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 	uint64_t obj;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
 
 	/* visit the MOS */
 	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 	    txg_start, NULL, flags, func, arg);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * visit each dataset
 	 *
 	 * Note that 'continue' still runs the loop's step expression, so err
 	 * is overwritten by the result of dmu_object_next().
 	 */
 	for (obj = 1; err == 0;
 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
 		if (err != 0) {
 			if (hard)
 				continue;
 			break;
 		}
 
 		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
 			/* The hold is taken with the pool config entered. */
 			dsl_pool_config_enter(dp, FTAG);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			dsl_pool_config_exit(dp, FTAG);
 			if (err != 0) {
 				if (hard)
 					continue;
 				break;
 			}
 			/* Start from the later of txg_start and the prev snap txg. */
-			if (ds->ds_phys->ds_prev_snap_txg > txg)
-				txg = ds->ds_phys->ds_prev_snap_txg;
+			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
 			/* A traversal error ends the walk even when 'hard' is set. */
 			if (err != 0)
 				break;
 		}
 	}
 	/* presumably ESRCH means "no more objects" -- TODO confirm */
 	if (err == ESRCH)
 		err = 0;
 	return (err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(revision 275782)
@@ -1,1655 +1,1657 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
 #include <sys/dsl_pool.h>
 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
 #include <sys/spa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
 #include <sys/varargs.h>
 
 /*
  * Pointer to a function taking a transaction, a dnode and two 64-bit
  * arguments and returning nothing.
  * NOTE(review): no use of this type is visible in the nearby code --
  * confirm it is still referenced.
  */
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
 
 /*
  * Allocate a zeroed transaction for dsl_dir dd (which may be NULL),
  * initialise its hold and callback lists and record its start time.
  * When dd is non-NULL the transaction's pool is taken from dd->dd_pool.
  */
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
 {
 	dmu_tx_t *tx;
 
 	tx = kmem_zalloc(sizeof (*tx), KM_SLEEP);
 
 	tx->tx_dir = dd;
 	if (dd != NULL) {
 		tx->tx_pool = dd->dd_pool;
 	}
 
 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
 
 	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	/* Debug-only accounting of space written and freed. */
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
 #endif
 	return (tx);
 }
 
 /*
  * Create a transaction for objset os: built on the dsl_dir of the
  * objset's dataset, with tx_objset set to os and tx_lastsnap_txg set to
  * the dataset's previous-snapshot txg.
  */
 dmu_tx_t *
 dmu_tx_create(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_dd(ds->ds_dir);
 	tx->tx_objset = os;
 	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(ds);
 
 	return (tx);
 }
 
 /*
  * Create a transaction that is already bound to txg in pool dp.  It has
  * no dsl_dir and is flagged tx_anyobj, which is what dmu_tx_is_syncing()
  * and dmu_tx_private_ok() report.  txg must not exceed the pool's
  * currently open txg.
  */
 dmu_tx_t *
 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
 {
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_dd(NULL);
 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
 
 	tx->tx_anyobj = TRUE;
 	tx->tx_txg = txg;
 	tx->tx_pool = dp;
 
 	return (tx);
 }
 
 /*
  * Report the transaction's tx_anyobj flag (set by
  * dmu_tx_create_assigned()).
  */
 int
 dmu_tx_is_syncing(dmu_tx_t *tx)
 {
 	int anyobj = tx->tx_anyobj;
 
 	return (anyobj);
 }
 
 /*
  * Report the transaction's tx_anyobj flag; currently the same answer as
  * dmu_tx_is_syncing().
  */
 int
 dmu_tx_private_ok(dmu_tx_t *tx)
 {
 	int anyobj = tx->tx_anyobj;
 
 	return (anyobj);
 }
 
 /*
  * Create a hold record for 'object' and append it to tx->tx_holds.
  *
  * Unless object is DMU_NEW_OBJECT the dnode is held first; if that fails
  * the error is stored in tx->tx_err and NULL is returned.  If the tx is
  * already assigned to a txg, the dnode is bound to that txg and a tx hold
  * is added to it.  type/arg1/arg2 are only recorded under ZFS_DEBUG.
  *
  * Returns the new hold; its txh_dnode is NULL for DMU_NEW_OBJECT.
  */
 static dmu_tx_hold_t *
 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 {
 	dmu_tx_hold_t *hold;
 	dnode_t *dn = NULL;
 
 	if (object != DMU_NEW_OBJECT) {
 		int err = dnode_hold(os, object, tx, &dn);
 
 		if (err != 0) {
 			tx->tx_err = err;
 			return (NULL);
 		}
 
 		if (tx->tx_txg != 0) {
 			mutex_enter(&dn->dn_mtx);
 			/*
 			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 			 * problem, but there's no way for it to happen (for
 			 * now, at least).
 			 */
 			ASSERT(dn->dn_assigned_txg == 0);
 			dn->dn_assigned_txg = tx->tx_txg;
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
 	}
 
 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
 	hold->txh_tx = tx;
 	hold->txh_dnode = dn;
 #ifdef ZFS_DEBUG
 	hold->txh_type = type;
 	hold->txh_arg1 = arg1;
 	hold->txh_arg2 = arg2;
 #endif
 	list_insert_tail(&tx->tx_holds, hold);
 
 	return (hold);
 }
 
 /*
  * Record a THT_NEWOBJECT hold on 'object' in tx, unless the tx reports
  * itself as syncing.
  */
 void
 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 {
 	/*
 	 * If we're syncing, they can manipulate any object anyhow, and
 	 * the hold on the dnode_t can cause problems.
 	 */
 	if (dmu_tx_is_syncing(tx))
 		return;
 
 	(void) dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT, 0, 0);
 }
 
 /*
  * Hold block 'blkid' at 'level' of dn and read it (failure allowed, no
  * prefetch), under the parent 'zio' when one is given.  Returns EIO when
  * the dbuf cannot be held, otherwise the result of dbuf_read().
  */
 static int
 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 {
 	dmu_buf_impl_t *dbuf;
 	int rc;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	dbuf = dbuf_hold_level(dn, level, blkid, FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (dbuf != NULL) {
 		rc = dbuf_read(dbuf, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 		dbuf_rele(dbuf, FTAG);
 	} else {
 		rc = SET_ERROR(EIO);
 	}
 	return (rc);
 }
 
 /*
  * Account for rewriting block 'blkid' at 'level' of dn and, recursively,
  * each of its ancestors.  history[] holds the last blkid counted per
  * level so a block is charged only once; the recursion ends when level
  * reaches dn_nlevels or a level has already seen this blkid.
  *
  * The block's size goes to txh_space_tooverwrite when it is considered
  * freeable (a freeable child makes every ancestor that has a block
  * pointer freeable too), otherwise to txh_space_towrite.  When a block
  * pointer exists its dsize is also added to txh_space_tounref.
  */
 static void
 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
     int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 {
 	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	dmu_buf_impl_t *up = NULL;
 	blkptr_t *bp = NULL;
 	uint64_t space;
 
 	if (level >= dn->dn_nlevels)
 		return;
 	if (history[level] == blkid)
 		return;
 	history[level] = blkid;
 
 	if (level == 0)
 		space = dn->dn_datablksz;
 	else
 		space = 1ULL << dn->dn_indblkshift;
 
 	if (db != NULL && db != dn->dn_dbuf) {
 		ASSERT(DB_DNODE(db) == dn);
 		ASSERT(db->db_level == level);
 		ASSERT(db->db.db_size == space);
 		ASSERT(db->db_blkid == blkid);
 		bp = db->db_blkptr;
 		up = db->db_parent;
 	} else {
 		/* No usable dbuf at this level: no bp and no parent dbuf. */
 		ASSERT(level != 0);
 		db = NULL;
 	}
 
 	/* Without a block pointer nothing can be overwritten in place. */
 	if (bp == NULL) {
 		freeable = B_FALSE;
 	} else if (freeable ||
 	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 		freeable = B_TRUE;
 	} else {
 		freeable = B_FALSE;
 	}
 
 	if (freeable)
 		txh->txh_space_tooverwrite += space;
 	else
 		txh->txh_space_towrite += space;
 
 	if (bp != NULL)
 		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 
 	dmu_tx_count_twig(txh, dn, up, level + 1,
 	    blkid >> epbs, freeable, history);
 }
 
 /*
  * Estimate the space consumed by writing len bytes at offset off of the
  * hold's dnode and add it to the hold's counters (txh_space_towrite,
  * txh_fudge, and via dmu_tx_count_twig() txh_space_tooverwrite and
  * txh_space_tounref).  A zero-length write is a no-op.
  *
  * When the dnode exists, the edge level-0 blocks and the interior level-1
  * blocks are read first so that i/o errors surface here.  Errors are
  * reported through txh->txh_tx->tx_err; EFBIG is reported when the
  * estimated write plus overwrite space exceeds 2 * DMU_MAX_ACCESS.
  */
 /* ARGSUSED */
 static void
 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	dnode_t *dn = txh->txh_dnode;
 	uint64_t start, end, i;
 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 	int err = 0;
 
 	if (len == 0)
 		return;
 
 	/* Worst-case block and indirect-block shifts when no dnode is held. */
 	min_bs = SPA_MINBLOCKSHIFT;
 	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
 	if (dn) {
 		uint64_t history[DN_MAX_LEVELS];
 		int nlvls = dn->dn_nlevels;
 		int delta;
 
 		/*
 		 * For i/o error checking, read the first and last level-0
 		 * blocks (if they are not aligned), and all the level-1 blocks.
 		 */
 		if (dn->dn_maxblkid == 0) {
 			/* Single-block object: start/end are 0 (in block 0) or 1. */
 			delta = dn->dn_datablksz;
 			start = (off < dn->dn_datablksz) ? 0 : 1;
 			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 				if (err)
 					goto out;
 				delta -= off;
 			}
 		} else {
 			/*
 			 * NOTE(review): the 'goto out' paths below leave this
 			 * root zio without a zio_wait() -- confirm whether
 			 * that is intended.
 			 */
 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
 			    NULL, NULL, ZIO_FLAG_CANFAIL);
 
 			/* first level-0 block */
 			start = off >> dn->dn_datablkshift;
 			if (P2PHASE(off, dn->dn_datablksz) ||
 			    len < dn->dn_datablksz) {
 				err = dmu_tx_check_ioerr(zio, dn, 0, start);
 				if (err)
 					goto out;
 			}
 
 			/* last level-0 block */
 			end = (off+len-1) >> dn->dn_datablkshift;
 			if (end != start && end <= dn->dn_maxblkid &&
 			    P2PHASE(off+len, dn->dn_datablksz)) {
 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
 				if (err)
 					goto out;
 			}
 
 			/* level-1 blocks */
 			if (nlvls > 1) {
 				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 				for (i = (start>>shft)+1; i < end>>shft; i++) {
 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
 					if (err)
 						goto out;
 				}
 			}
 
 			err = zio_wait(zio);
 			if (err)
 				goto out;
 			delta = P2NPHASE(off, dn->dn_datablksz);
 		}
 
 		min_ibs = max_ibs = dn->dn_indblkshift;
 		if (dn->dn_maxblkid > 0) {
 			/*
 			 * The blocksize can't change,
 			 * so we can make a more precise estimate.
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
 		} else {
 			/*
 			 * The blocksize can increase up to the recordsize,
 			 * or if it is already more than the recordsize,
 			 * up to the next power of 2.
 			 */
 			min_bs = highbit64(dn->dn_datablksz - 1);
 			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
 		 * If this write is not off the end of the file
 		 * we need to account for overwrites/unref.
 		 */
 		if (start <= dn->dn_maxblkid) {
 			/* -1ULL marks "no block counted yet" at each level. */
 			for (int l = 0; l < DN_MAX_LEVELS; l++)
 				history[l] = -1ULL;
 		}
 		while (start <= dn->dn_maxblkid) {
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
 
 			if (err) {
 				txh->txh_tx->tx_err = err;
 				return;
 			}
 
 			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 			    history);
 			dbuf_rele(db, FTAG);
 			if (++start > end) {
 				/*
 				 * Account for new indirects appearing
 				 * before this IO gets assigned into a txg.
 				 */
 				bits = 64 - min_bs;
 				epbs = min_ibs - SPA_BLKPTRSHIFT;
 				for (bits -= epbs * (nlvls - 1);
 				    bits >= 0; bits -= epbs)
 					txh->txh_fudge += 1ULL << max_ibs;
 				goto out;
 			}
 			/* Step past the block just counted. */
 			off += delta;
 			if (len >= delta)
 				len -= delta;
 			delta = dn->dn_datablksz;
 		}
 	}
 
 	/*
 	 * 'end' is the last thing we will access, not one past.
 	 * This way we won't overflow when accessing the last byte.
 	 */
 	start = P2ALIGN(off, 1ULL << max_bs);
 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 	txh->txh_space_towrite += end - start + 1;
 
 	start >>= min_bs;
 	end >>= min_bs;
 
 	epbs = min_ibs - SPA_BLKPTRSHIFT;
 
 	/*
 	 * The object contains at most 2^(64 - min_bs) blocks,
 	 * and each indirect level maps 2^epbs.
 	 */
 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 		start >>= epbs;
 		end >>= epbs;
 		ASSERT3U(end, >=, start);
 		txh->txh_space_towrite += (end - start + 1) << max_ibs;
 		if (start != 0) {
 			/*
 			 * We also need a new blkid=0 indirect block
 			 * to reference any existing file data.
 			 */
 			txh->txh_space_towrite += 1ULL << max_ibs;
 		}
 	}
 
 out:
 	/* An over-large estimate replaces any earlier error with EFBIG. */
 	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 	    2 * DMU_MAX_ACCESS)
 		err = SET_ERROR(EFBIG);
 
 	if (err)
 		txh->txh_tx->tx_err = err;
 }
 
 /*
  * Charge the hold for rewriting the piece of the meta-dnode that covers
  * its dnode: one meta-dnode data block plus (dn_nlevels - 1) indirect
  * blocks.  If the dnode's dbuf has no block pointer (or there is no
  * dnode) this is new space to write; otherwise it is also counted as
  * unreferenced, and as an overwrite when the existing block is freeable.
  */
 static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
 	dnode_t *dn = txh->txh_dnode;
 	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 	uint64_t space = mdn->dn_datablksz +
 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 	blkptr_t *bp = NULL;
 
 	if (dn != NULL)
 		bp = dn->dn_dbuf->db_blkptr;
 
 	if (bp == NULL) {
 		/* Nothing on disk yet: purely new space. */
 		txh->txh_space_towrite += space;
 		return;
 	}
 
 	if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 	    bp, bp->blk_birth))
 		txh->txh_space_tooverwrite += space;
 	else
 		txh->txh_space_towrite += space;
 	txh->txh_space_tounref += space;
 }
 
 /*
  * Declare that an unassigned tx will write len bytes at offset off of
  * 'object'.  Adds a THT_WRITE hold and accounts for both the data write
  * and the dnode update.  If the hold cannot be created nothing further is
  * done here (dmu_tx_hold_object_impl() records the error in the tx).
  */
 void
 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(len < DMU_MAX_ACCESS);
 	/* off + len - 1 must not wrap around. */
 	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_WRITE, off, len);
 	if (hold != NULL) {
 		dmu_tx_count_write(hold, off, len);
 		dmu_tx_count_dnode(hold);
 	}
 }
 
 /*
  * Estimate the effect of freeing [off, off + len) of the hold's dnode and
  * fold it into the hold: space that can be freed (txh_space_tofree),
  * space that becomes unreferenced (txh_space_tounref), memory needed for
  * indirect blocks (txh_memory_tohold) and a fudge factor for level-1
  * indirects that might appear (txh_fudge).  Lookup and read errors are
  * recorded in txh->txh_tx->tx_err.
  *
  * Nothing is counted for a dnode with no levels, for a partial free of a
  * single-block object, or for a range starting past dn_maxblkid.
  */
 static void
 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	uint64_t blkid, nblks, lastblk;
 	uint64_t space = 0, unref = 0, skipped = 0;
 	dnode_t *dn = txh->txh_dnode;
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 	int epbs;
 	uint64_t l0span = 0, nl1blks = 0;
 
 	if (dn->dn_nlevels == 0)
 		return;
 
 	/*
 	 * The struct_rwlock protects us against dn_nlevels
 	 * changing, in case (against all odds) we manage to dirty &
 	 * sync out the changes after we check for being dirty.
 	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_maxblkid == 0) {
 		if (off == 0 && len >= dn->dn_datablksz) {
 			blkid = 0;
 			nblks = 1;
 		} else {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 	} else {
 		blkid = off >> dn->dn_datablkshift;
 		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 
 		if (blkid > dn->dn_maxblkid) {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 		/* Clamp the range to the blocks that exist. */
 		if (blkid + nblks > dn->dn_maxblkid)
 			nblks = dn->dn_maxblkid - blkid + 1;
 
 	}
 	l0span = nblks;    /* save for later use to calc level > 1 overhead */
 	if (dn->dn_nlevels == 1) {
 		/* The block pointers live directly in the dnode. */
 		int i;
 		for (i = 0; i < nblks; i++) {
 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
 			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 			bp += blkid + i;
 			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
 				space += bp_get_dsize(spa, bp);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
 		nl1blks = 1;
 		nblks = 0;
 	}
 
 	lastblk = blkid + nblks - 1;
 	while (nblks) {
 		dmu_buf_impl_t *dbuf;
 		uint64_t ibyte, new_blkid;
 		int epb = 1 << epbs;
 		int err, i, blkoff, tochk;
 		blkptr_t *bp;
 
 		/* Find the next level-1 block at or after blkid. */
 		ibyte = blkid << dn->dn_datablkshift;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 		new_blkid = ibyte >> dn->dn_datablkshift;
 		if (err == ESRCH) {
 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 			break;
 		}
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
 		if (new_blkid > lastblk) {
 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 			break;
 		}
 
 		if (new_blkid > blkid) {
 			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 			nblks -= new_blkid - blkid;
 			blkid = new_blkid;
 		}
 		/* Entries to examine within this level-1 block. */
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
 
 		txh->txh_memory_tohold += dbuf->db.db_size;
 
 		/*
 		 * We don't check memory_tohold against DMU_MAX_ACCESS because
 		 * memory_tohold is an over-estimation (especially the >L1
 		 * indirect blocks), so it could fail.  Callers should have
 		 * already verified that they will not be holding too much
 		 * memory.
 		 */
 
 		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 		if (err != 0) {
 			txh->txh_tx->tx_err = err;
 			dbuf_rele(dbuf, FTAG);
 			break;
 		}
 
 		bp = dbuf->db.db_data;
 		bp += blkoff;
 
 		for (i = 0; i < tochk; i++) {
 			if (dsl_dataset_block_freeable(ds, &bp[i],
 			    bp[i].blk_birth)) {
 				dprintf_bp(&bp[i], "can free old%s", "");
 				space += bp_get_dsize(spa, &bp[i]);
 			}
 			/*
 			 * Count the allocated size of the entry being
 			 * examined, not of the first entry each time round.
 			 */
 			unref += BP_GET_ASIZE(&bp[i]);
 		}
 		dbuf_rele(dbuf, FTAG);
 
 		++nl1blks;
 		blkid += tochk;
 		nblks -= tochk;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/*
 	 * Add in memory requirements of higher-level indirects.
 	 * This assumes a worst-possible scenario for dn_nlevels and a
 	 * worst-possible distribution of l1-blocks over the region to free.
 	 */
 	{
 		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 		int level = 2;
 		/*
 		 * Here we don't use DN_MAX_LEVEL, but calculate it with the
 		 * given datablkshift and indblkshift. This makes the
 		 * difference between 19 and 8 on large files.
 		 */
 		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 
 		while (level++ < maxlevel) {
 			txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 			    << dn->dn_indblkshift;
 			blkcnt = 1 + (blkcnt >> epbs);
 		}
 	}
 
 	/* account for new level 1 indirect blocks that might show up */
 	if (skipped > 0) {
 		txh->txh_fudge += skipped << dn->dn_indblkshift;
 		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 	}
 	txh->txh_space_tofree += space;
 	txh->txh_space_tounref += unref;
 }
 
 /*
  * Mark the transaction as a "net free".  Refquotas are then disabled for
  * it and it may use half of the pool space overhead (see
  * dsl_pool_adjustedsize()).  Call this only for transactions that are
  * expected not to cause a net increase in space used; it is acceptable
  * for that expectation to be wrong occasionally.
  */
 void
 dmu_tx_mark_netfree(dmu_tx_t *tx)
 {
 	/*
 	 * Pretend that this operation will free 1GB of space.  This
 	 * should be large enough to cancel out the largest write.
 	 * We don't want to use something like UINT64_MAX, because that would
 	 * cause overflows when doing math with these values (e.g. in
 	 * dmu_tx_try_assign()).
 	 */
 	uint64_t pretend_free = 1024 * 1024 * 1024;
 	dmu_tx_hold_t *hold;
 
 	/* A DMU_NEW_OBJECT hold carries no dnode; it only holds the numbers. */
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    DMU_NEW_OBJECT, THT_FREE, 0, 0);
 
 	hold->txh_space_tounref = pretend_free;
 	hold->txh_space_tofree = pretend_free;
 }
 
 /*
  * Declare that an unassigned tx will free [off, off + len) of 'object'
  * (len may be DMU_OBJECT_END, meaning "through the last block").
  *
  * Adds a THT_FREE hold, charges for the dnode update, charges a write for
  * each partially covered edge block, reads the level-1 indirect blocks in
  * the range to surface i/o errors, and finally accounts for the freed
  * space with dmu_tx_count_free().  Errors are stored in tx->tx_err.
  */
 void
 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
 	int err;
 	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_FREE, off, len);
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
 	dmu_tx_count_dnode(txh);
 
 	/* Nothing to account for when the range starts past the last block. */
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
 
 	/*
 	 * For i/o error checking, we read the first and last level-0
 	 * blocks if they are not aligned, and all the level-1 blocks.
 	 *
 	 * Note:  dbuf_free_range() assumes that we have not instantiated
 	 * any level-0 dbufs that will be completely freed.  Therefore we must
 	 * exercise care to not read or count the first and last blocks
 	 * if they are blocksize-aligned.
 	 */
 	if (dn->dn_datablkshift == 0) {
 		/* Block size is not a power of two: only block 0 is considered. */
 		if (off != 0 || len < dn->dn_datablksz)
 			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 	} else {
 		/* first block will be modified if it is not aligned */
 		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 			dmu_tx_count_write(txh, off, 1);
 		/* last block will be modified if it is not aligned */
 		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 			dmu_tx_count_write(txh, off+len, 1);
 	}
 
 	/*
 	 * Check level-1 blocks.
 	 */
 	if (dn->dn_nlevels > 1) {
 		/* shift converts a byte offset into a level-1 block index. */
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		uint64_t start = off >> shift;
 		uint64_t end = (off + len) >> shift;
 
 		ASSERT(dn->dn_indblkshift != 0);
 
 		/*
 		 * dnode_reallocate() can result in an object with indirect
 		 * blocks having an odd data block size.  In this case,
 		 * just check the single block.
 		 */
 		if (dn->dn_datablkshift == 0)
 			start = end = 0;
 
 		/*
 		 * NOTE(review): the early returns inside the loop below leave
 		 * this root zio without a zio_wait() -- confirm whether that
 		 * is intended.
 		 */
 		zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (uint64_t i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
 			/* Jump i forward to the offset dnode_next_offset() reports. */
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
 			if (err == ESRCH)
 				break;
 			if (err) {
 				tx->tx_err = err;
 				return;
 			}
 
 			err = dmu_tx_check_ioerr(zio, dn, 1, i);
 			if (err) {
 				tx->tx_err = err;
 				return;
 			}
 		}
 		err = zio_wait(zio);
 		if (err) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
 	dmu_tx_count_free(txh, off, len);
 }
 
 /*
  * Declare that an unassigned tx will modify ZAP object 'object'; 'add'
  * is non-zero when an entry will be added and 'name' (may be NULL) is the
  * entry concerned.
  *
  * Adds a THT_ZAP hold and charges for the dnode update.  A new object is
  * charged two default-sized fat-zap blocks; a single-block ZAP that is not
  * being added to is charged one MZAP_MAX_BLKSZ block; anything else is
  * estimated by zap_count_write() plus three indirect blocks per level of
  * indirection.  I/O errors found on the way are stored in tx->tx_err.
  */
 void
 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
+	dsl_dataset_phys_t *ds_phys;
 	uint64_t nblocks;
 	int epbs, err;
 
 	ASSERT(tx->tx_txg == 0);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_ZAP, add, (uintptr_t)name);
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
 
 	dmu_tx_count_dnode(txh);
 
 	if (dn == NULL) {
 		/*
 		 * We will be able to fit a new object's entries into one leaf
 		 * block.  So there will be at most 2 blocks total,
 		 * including the header block.
 		 */
 		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 		return;
 	}
 
 	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
 	if (dn->dn_maxblkid == 0 && !add) {
 		blkptr_t *bp;
 
 		/*
 		 * If there is only one block  (i.e. this is a micro-zap)
 		 * and we are not adding anything, the accounting is simple.
 		 */
 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 		if (err) {
 			tx->tx_err = err;
 			return;
 		}
 
 		/*
 		 * Use max block size here, since we don't know how much
 		 * the size will change between now and the dbuf dirty call.
 		 */
 		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
 			txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 		else
 			txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 		return;
 	}
 
 	if (dn->dn_maxblkid > 0 && name) {
 		/*
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
 		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 		    8, 0, NULL);
 		/* Only EIO matters here; any other lookup result is ignored. */
 		if (err == EIO) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
 	/* NOTE(review): err from zap_count_write() is never examined. */
 	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*
 	 * If the modified blocks are scattered to the four winds,
 	 * we'll have to modify an indirect twig for each.
 	 */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
 	/* One iteration per level of indirection above the leaves. */
 	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
+		if (ds_phys->ds_prev_snap_obj)
 			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 		else
 			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 }
 
 /*
  * Declare that an unassigned tx will modify the bonus buffer of 'object':
  * adds a THT_BONUS hold and charges for the dnode update.  Nothing is
  * charged if the hold could not be created.
  */
 void
 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_BONUS, 0, 0);
 	if (hold == NULL)
 		return;
 
 	dmu_tx_count_dnode(hold);
 }
 
 /*
  * Reserve 'space' bytes of write space in an unassigned tx through a
  * THT_SPACE hold that is not tied to any object (DMU_NEW_OBJECT).
  */
 void
 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
 	hold->txh_space_towrite = hold->txh_space_towrite + space;
 }
 
 /*
  * Count the holds in an assigned tx whose dnode is 'object'.
  */
 int
 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 {
 	dmu_tx_hold_t *cur;
 	int count = 0;
 
 	/*
 	 * By asserting that the tx is assigned, we're counting the
 	 * number of dn_tx_holds, which is the same as the number of
 	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
 	 * dn_tx_holds could be 0.
 	 */
 	ASSERT(tx->tx_txg != 0);
 
 	/* tx_anyobj transactions are deliberately not special-cased here. */
 
 	cur = list_head(&tx->tx_holds);
 	while (cur != NULL) {
 		dnode_t *dn = cur->txh_dnode;
 
 		if (dn != NULL && dn->dn_object == object)
 			count++;
 		cur = list_next(&tx->tx_holds, cur);
 	}
 
 	return (count);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only check that dirtying dbuf db is covered by one of tx's holds.
  * Returns quietly when tx is a tx_anyobj transaction, when db belongs to
  * the meta dnode, or when some hold matches both the object and the block;
  * otherwise panics.
  */
 void
 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
 	dmu_tx_hold_t *txh;
 	int match_object = FALSE, match_offset = FALSE;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
 	if (tx->tx_anyobj) {
 		DB_DNODE_EXIT(db);
 		return;
 	}
 
 	/* XXX No checking on the meta dnode for now */
 	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		DB_DNODE_EXIT(db);
 		return;
 	}
 
 	/*
 	 * match_object and match_offset accumulate across holds, so they may
 	 * be satisfied by two different holds.
 	 */
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 			match_object = TRUE;
 		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 			/*
 			 * Convert the hold's byte range (arg1 = offset,
 			 * arg2 = length) into block ids at db's level.
 			 */
 			int datablkshift = dn->dn_datablkshift ?
 			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 			int shift = datablkshift + epbs * db->db_level;
 			uint64_t beginblk = shift >= 64 ? 0 :
 			    (txh->txh_arg1 >> shift);
 			uint64_t endblk = shift >= 64 ? 0 :
 			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 			uint64_t blkid = db->db_blkid;
 
 			/* XXX txh_arg2 better not be zero... */
 
 			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 			    txh->txh_type, beginblk, endblk);
 
 			switch (txh->txh_type) {
 			case THT_WRITE:
 				if (blkid >= beginblk && blkid <= endblk)
 					match_offset = TRUE;
 				/*
 				 * We will let this hold work for the bonus
 				 * or spill buffer so that we don't need to
 				 * hold it when creating a new object.
 				 */
 				if (blkid == DMU_BONUS_BLKID ||
 				    blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				/*
 				 * They might have to increase nlevels,
 				 * thus dirtying the new TLIBs.  Or they
 				 * might have to change the block size,
 				 * thus dirtying the new lvl=0 blk=0.
 				 */
 				if (blkid == 0)
 					match_offset = TRUE;
 				break;
 			case THT_FREE:
 				/*
 				 * We will dirty all the level 1 blocks in
 				 * the free range and perhaps the first and
 				 * last level 0 block.
 				 */
 				if (blkid >= beginblk && (blkid <= endblk ||
 				    txh->txh_arg2 == DMU_OBJECT_END))
 					match_offset = TRUE;
 				break;
 			case THT_SPILL:
 				if (blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_BONUS:
 				if (blkid == DMU_BONUS_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_ZAP:
 				/* A ZAP hold covers every block of the object. */
 				match_offset = TRUE;
 				break;
 			case THT_NEWOBJECT:
 				match_object = TRUE;
 				break;
 			default:
 				ASSERT(!"bad txh_type");
 			}
 		}
 		if (match_object && match_offset) {
 			DB_DNODE_EXIT(db);
 			return;
 		}
 	}
 	DB_DNODE_EXIT(db);
 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 	    (u_longlong_t)db->db.db_object, db->db_level,
 	    (u_longlong_t)db->db_blkid);
 }
 #endif
 
 /*
  * If we can't do 10 iops, something is wrong.  Let us go ahead
  * and hit zfs_dirty_data_max.
  *
  * zfs_delay_max_ns is 100 milliseconds expressed in nanoseconds, i.e. the
  * per-transaction delay that corresponds to 10 operations per second.
  */
 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 
 /*
  * We delay transactions when we've determined that the backend storage
  * isn't able to accommodate the rate of incoming writes.
  *
  * If there is already a transaction waiting, we delay relative to when
  * that transaction finishes waiting.  This way the calculated min_time
  * is independent of the number of threads concurrently executing
  * transactions.
  *
  * If we are the only waiter, wait relative to when the transaction
  * started, rather than the current time.  This credits the transaction for
  * "time already served", e.g. reading indirect blocks.
  *
  * The minimum time for a transaction to take is calculated as:
  *     min_time = scale * (dirty - min) / (max - dirty)
  *     min_time is then capped at zfs_delay_max_ns.
  *
  * The delay has two degrees of freedom that can be adjusted via tunables.
  * The percentage of dirty data at which we start to delay is defined by
  * zfs_delay_min_dirty_percent. This should typically be at or above
  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
  * delay after writing at full speed has failed to keep up with the incoming
  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
  * speaking, this variable determines the amount of delay at the midpoint of
  * the curve.
  *
  * delay
  *  10ms +-------------------------------------------------------------*+
  *       |                                                             *|
  *   9ms +                                                             *+
  *       |                                                             *|
  *   8ms +                                                             *+
  *       |                                                            * |
  *   7ms +                                                            * +
  *       |                                                            * |
  *   6ms +                                                            * +
  *       |                                                            * |
  *   5ms +                                                           *  +
  *       |                                                           *  |
  *   4ms +                                                           *  +
  *       |                                                           *  |
  *   3ms +                                                          *   +
  *       |                                                          *   |
  *   2ms +                                              (midpoint) *    +
  *       |                                                  |    **     |
  *   1ms +                                                  v ***       +
  *       |             zfs_delay_scale ---------->     ********         |
  *     0 +-------------------------------------*********----------------+
  *       0%                    <- zfs_dirty_data_max ->               100%
  *
  * Note that since the delay is added to the outstanding time remaining on the
  * most recent transaction, the delay is effectively the inverse of IOPS.
  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
  * was chosen such that small changes in the amount of accumulated dirty data
  * in the first 3/4 of the curve yield relatively small differences in the
  * amount of delay.
  *
  * The effects can be easier to understand when the amount of delay is
  * represented on a log scale:
  *
  * delay
  * 100ms +-------------------------------------------------------------++
  *       +                                                              +
  *       |                                                              |
  *       +                                                             *+
  *  10ms +                                                             *+
  *       +                                                           ** +
  *       |                                              (midpoint)  **  |
  *       +                                                  |     **    +
  *   1ms +                                                  v ****      +
  *       +             zfs_delay_scale ---------->        *****         +
  *       |                                             ****             |
  *       +                                          ****                +
  * 100us +                                        **                    +
  *       +                                       *                      +
  *       |                                      *                       |
  *       +                                     *                        +
  *  10us +                                     *                        +
  *       +                                                              +
  *       |                                                              |
  *       +                                                              +
  *       +--------------------------------------------------------------+
  *       0%                    <- zfs_dirty_data_max ->               100%
  *
  * Note here that only as the amount of dirty data approaches its limit does
  * the delay start to increase rapidly. The goal of a properly tuned system
  * should be to keep the amount of dirty data out of that range by first
  * ensuring that the appropriate limits are set for the I/O scheduler to reach
  * optimal throughput on the backend storage, and then by changing the value
  * of zfs_delay_scale to increase the steepness of the curve.
  */
 static void
 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	/* Below the delay threshold there is nothing to throttle. */
 	if (dirty <= delay_min_bytes)
 		return;
 
 	/*
 	 * The caller has already waited until we are under the max.
 	 * We make them pass us the amount of dirty data so we don't
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
 	ASSERT3U(dirty, <, zfs_dirty_data_max);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
 	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
 
 	/*
 	 * Cap the minimum time before checking whether this transaction
 	 * has already served it.  Comparing against the uncapped value
 	 * would keep delaying a transaction that has already been open
 	 * for longer than zfs_delay_max_ns.
 	 */
 	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
 	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
 	    uint64_t, min_tx_time);
 
 	/*
 	 * Wake up min_tx_time after the later of this transaction's start
 	 * and the previous waiter's wakeup, and publish the result so the
 	 * next waiter queues behind us.
 	 */
 	mutex_enter(&dp->dp_lock);
 	wakeup = MAX(tx->tx_start + min_tx_time,
 	    dp->dp_last_wakeup + min_tx_time);
 	dp->dp_last_wakeup = wakeup;
 	mutex_exit(&dp->dp_lock);
 
 #ifdef _KERNEL
 #ifdef illumos
 	mutex_enter(&curthread->t_delay_lock);
 	while (cv_timedwait_hires(&curthread->t_delay_cv,
 	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
 	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
 		continue;
 	mutex_exit(&curthread->t_delay_lock);
 #else
 	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
 	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
 #endif
 #else
 	hrtime_t delta = wakeup - gethrtime();
 	struct timespec ts;
 
 	/*
 	 * The wakeup time may already have passed by the time we get here;
 	 * nanosleep() rejects negative intervals, so only sleep when there
 	 * is time left to wait.
 	 */
 	if (delta > 0) {
 		ts.tv_sec = delta / NANOSEC;
 		ts.tv_nsec = delta % NANOSEC;
 		(void) nanosleep(&ts, NULL);
 	}
 #endif
 }
 
 /*
  * Make one attempt to assign tx to the currently open txg.
  *
  * Returns 0 on success.  Otherwise returns tx->tx_err if it is already
  * set, EIO or ERESTART when the pool is suspended, ERESTART when a dirty
  * data delay is required (tx_wait_dirty is set) or when a held dnode is
  * still assigned to the previous txg (tx_needassign_txh records the hold),
  * or the error from dsl_dir_tempreserve_space().
  *
  * Once txg_hold_open() has run, tx->tx_txg is non-zero even on failure.
  */
 static int
 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
 	uint64_t memory, asize, fsize, usize;
 	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 
 	ASSERT0(tx->tx_txg);
 
 	/* An error recorded on the tx is simply reported back. */
 	if (tx->tx_err)
 		return (tx->tx_err);
 
 	if (spa_suspended(spa)) {
 		/*
 		 * If the user has indicated a blocking failure mode
 		 * then return ERESTART which will block in dmu_tx_wait().
 		 * Otherwise, return EIO so that an error can get
 		 * propagated back to the VOP calls.
 		 *
 		 * Note that we always honor the txg_how flag regardless
 		 * of the failuremode setting.
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 		    txg_how != TXG_WAIT)
 			return (SET_ERROR(EIO));
 
 		return (SET_ERROR(ERESTART));
 	}
 
 	/* Ask the caller to go through dmu_tx_wait() for the dirty delay. */
 	if (!tx->tx_waited &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
 		return (SET_ERROR(ERESTART));
 	}
 
 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 	tx->tx_needassign_txh = NULL;
 
 	/*
 	 * NB: No error returns are allowed after txg_hold_open, but
 	 * before processing the dnode holds, due to the
 	 * dmu_tx_unassign() logic.
 	 */
 
 	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
 		if (dn != NULL) {
 			mutex_enter(&dn->dn_mtx);
 			/* Still assigned to the previous txg: must retry. */
 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 				mutex_exit(&dn->dn_mtx);
 				tx->tx_needassign_txh = txh;
 				return (SET_ERROR(ERESTART));
 			}
 			if (dn->dn_assigned_txg == 0)
 				dn->dn_assigned_txg = tx->tx_txg;
 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
 		/* Accumulate the per-hold space and memory estimates. */
 		towrite += txh->txh_space_towrite;
 		tofree += txh->txh_space_tofree;
 		tooverwrite += txh->txh_space_tooverwrite;
 		tounref += txh->txh_space_tounref;
 		tohold += txh->txh_memory_tohold;
 		fudge += txh->txh_fudge;
 	}
 
 	/*
 	 * If a snapshot has been taken since we made our estimates,
 	 * assume that we won't be able to free or overwrite anything.
 	 */
 	if (tx->tx_objset &&
 	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 	    tx->tx_lastsnap_txg) {
 		towrite += tooverwrite;
 		tooverwrite = tofree = 0;
 	}
 
 	/* needed allocation: worst-case estimate of write space */
 	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 	/* freed space estimate: worst-case overwrite + free estimate */
 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 	/* convert unrefd space to worst-case estimate */
 	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 	/* calculate memory footprint estimate */
 	memory = towrite + tooverwrite + tohold;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Add in 'tohold' to account for our dirty holds on this memory
 	 * XXX - the "fudge" factor is to account for skipped blocks that
 	 * we missed because dnode_next_offset() misses in-core-only blocks.
 	 */
 	tx->tx_space_towrite = asize +
 	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 	tx->tx_space_tofree = tofree;
 	tx->tx_space_tooverwrite = tooverwrite;
 	tx->tx_space_tounref = tounref;
 #endif
 
 	/* Only reserve when there is a dsl_dir and something to allocate. */
 	if (tx->tx_dir && asize != 0) {
 		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
 		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
 		if (err)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Undo a (possibly partial) assignment made by dmu_tx_try_assign().
  *
  * Does nothing if the tx was never given a txg.  Otherwise drops the
  * dnode tx holds taken so far, releases the txg handle, remembers the
  * txg in tx_lasttried_txg and clears tx_txg.
  */
 static void
 dmu_tx_unassign(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *txh;
 
 	if (tx->tx_txg == 0)
 		return;
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
 	/*
 	 * Walk the transaction's hold list, removing the hold on the
 	 * associated dnode, and notifying waiters if the refcount drops to 0.
 	 */
 	/*
 	 * The walk stops at tx_needassign_txh: holds from that entry onward
 	 * were never processed by dmu_tx_try_assign().  When it is NULL the
 	 * whole list is walked.
 	 */
 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
 
 		if (dn == NULL)
 			continue;
 		mutex_enter(&dn->dn_mtx);
 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 			dn->dn_assigned_txg = 0;
 			cv_broadcast(&dn->dn_notxholds);
 		}
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	txg_rele_to_sync(&tx->tx_txgh);
 
 	/* dmu_tx_wait() uses tx_lasttried_txg to decide what to wait for. */
 	tx->tx_lasttried_txg = tx->tx_txg;
 	tx->tx_txg = 0;
 }
 
 /*
  * Assign tx to a transaction group.  txg_how can be one of:
  *
  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
  *	a new one.  This should be used when you're not holding locks.
  *	It will only fail if we're truly out of space (or over quota).
  *
  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
  *	blocking, returns immediately with ERESTART.  This should be used
  *	whenever you're holding locks.  On an ERESTART error, the caller
  *	should drop locks, do a dmu_tx_wait(tx), and try again.
  *
  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
  *      has already been called on behalf of this operation (though
  *      most likely on a different tx).
  */
 int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
 	    txg_how == TXG_WAITED);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
 	/* A TXG_WAIT caller may block, so it must not hold the config lock. */
 	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
 
 	/* TXG_WAITED means a wait was already done for this operation. */
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
 	for (;;) {
 		int rc = dmu_tx_try_assign(tx, txg_how);
 
 		if (rc == 0)
 			break;
 
 		/* Roll back whatever the failed attempt managed to set up. */
 		dmu_tx_unassign(tx);
 
 		/* Only ERESTART under TXG_WAIT is retried here. */
 		if (!(rc == ERESTART && txg_how == TXG_WAIT))
 			return (rc);
 
 		dmu_tx_wait(tx);
 	}
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
 	return (0);
 }
 
 /*
  * Block until the condition that made dmu_tx_try_assign() fail with
  * ERESTART is likely to have cleared.  The tx must be unassigned
  * (tx_txg == 0) and the caller must not hold the pool config lock.
  *
  * Exactly one of four waits is performed, chosen from the state left
  * behind by the failed attempt: dirty data throttling, pool suspension,
  * a dnode still held by the previous txg, or the next txg opening.
  */
 void
 dmu_tx_wait(dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
 	dsl_pool_t *dp = tx->tx_pool;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
 	if (tx->tx_wait_dirty) {
 		/*
 		 * dmu_tx_try_assign() has determined that we need to wait
 		 * because we've consumed much or all of the dirty buffer
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
 		while (dp->dp_dirty_total >= zfs_dirty_data_max)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		/* Sample under the lock; dmu_tx_delay() needs it < the max. */
 		uint64_t dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
 
 		dmu_tx_delay(tx, dirty);
 
 		tx->tx_wait_dirty = B_FALSE;
 
 		/*
 		 * Note: setting tx_waited only has effect if the caller
 		 * used TXG_WAIT.  Otherwise they are going to destroy
 		 * this tx and try again.  The common case, zfs_write(),
 		 * uses TXG_WAIT.
 		 */
 		tx->tx_waited = B_TRUE;
 	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
 		/*
 		 * If the pool is suspended we need to wait until it
 		 * is resumed.  Note that it's possible that the pool
 		 * has become active after this thread has tried to
 		 * obtain a tx.  If that's the case then tx_lasttried_txg
 		 * would not have been set.
 		 */
 		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 	} else if (tx->tx_needassign_txh) {
 		/*
 		 * A dnode is assigned to the quiescing txg.  Wait for its
 		 * transaction to complete.
 		 */
 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 
 		mutex_enter(&dn->dn_mtx);
 		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
 			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
 		mutex_exit(&dn->dn_mtx);
 		tx->tx_needassign_txh = NULL;
 	} else {
 		/* Otherwise wait for a txg newer than the one last tried. */
 		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
 	}
 }
 
 void
 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
 {
 #ifdef ZFS_DEBUG
 	/*
 	 * Debug-only accounting: a positive delta is charged to the written
 	 * counter (and checked against the estimate), a negative delta is
 	 * charged to the freed counter.  Nothing happens without a dsl_dir
 	 * or for a zero delta.
 	 */
 	if (tx->tx_dir != NULL && delta != 0) {
 		if (delta < 0) {
 			(void) refcount_add_many(&tx->tx_space_freed,
 			    -delta, NULL);
 		} else {
 			ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
 			    tx->tx_space_towrite);
 			(void) refcount_add_many(&tx->tx_space_written,
 			    delta, NULL);
 		}
 	}
 #endif
 }
 
 /*
  * Commit an assigned transaction: drop every hold, clear the temporary
  * space reservation, hand any callbacks to the txg, and free the tx.
  */
 void
 dmu_tx_commit(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg != 0);
 
 	/*
 	 * Drain the hold list.  Each record is freed as soon as it is
 	 * unlinked; the dnode pointer is saved first so the dnode's tx hold
 	 * can still be dropped and its waiters notified afterwards.
 	 */
 	while ((hold = list_head(&tx->tx_holds)) != NULL) {
 		dnode_t *dn = hold->txh_dnode;
 
 		list_remove(&tx->tx_holds, hold);
 		kmem_free(hold, sizeof (dmu_tx_hold_t));
 
 		if (dn != NULL) {
 			mutex_enter(&dn->dn_mtx);
 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
 			if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 				dn->dn_assigned_txg = 0;
 				cv_broadcast(&dn->dn_notxholds);
 			}
 			mutex_exit(&dn->dn_mtx);
 			dnode_rele(dn, tx);
 		}
 	}
 
 	if (tx->tx_tempreserve_cookie)
 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
 
 	/* Pending callbacks move to the txg rather than running here. */
 	if (!list_is_empty(&tx->tx_callbacks))
 		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
 
 	if (!tx->tx_anyobj)
 		txg_rele_to_sync(&tx->tx_txgh);
 
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
 	refcount_destroy_many(&tx->tx_space_written,
 	    refcount_count(&tx->tx_space_written));
 	refcount_destroy_many(&tx->tx_space_freed,
 	    refcount_count(&tx->tx_space_freed));
 #endif
 	kmem_free(tx, sizeof (dmu_tx_t));
 }
 
 /*
  * Discard an unassigned transaction: release every held dnode, run any
  * registered callbacks with ECANCELED, and free the tx.
  */
 void
 dmu_tx_abort(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	/* Unlink and free each hold record, then release its dnode. */
 	for (hold = list_head(&tx->tx_holds); hold != NULL;
 	    hold = list_head(&tx->tx_holds)) {
 		dnode_t *dn = hold->txh_dnode;
 
 		list_remove(&tx->tx_holds, hold);
 		kmem_free(hold, sizeof (dmu_tx_hold_t));
 		if (dn == NULL)
 			continue;
 		dnode_rele(dn, tx);
 	}
 
 	/*
 	 * Callbacks registered on an aborted tx still run, but are told
 	 * that the transaction was cancelled.
 	 */
 	if (!list_is_empty(&tx->tx_callbacks))
 		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
 
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	refcount_destroy_many(&tx->tx_space_written,
 	    refcount_count(&tx->tx_space_written));
 	refcount_destroy_many(&tx->tx_space_freed,
 	    refcount_count(&tx->tx_space_freed));
 #endif
 	kmem_free(tx, sizeof (dmu_tx_t));
 }
 
 /*
  * Return the txg this transaction is assigned to.  The tx must already
  * be assigned (tx_txg != 0).
  */
 uint64_t
 dmu_tx_get_txg(dmu_tx_t *tx)
 {
 	ASSERT(tx->tx_txg != 0);
 	return (tx->tx_txg);
 }
 
 /*
  * Return the dsl_pool_t this transaction belongs to.
  */
 dsl_pool_t *
 dmu_tx_pool(dmu_tx_t *tx)
 {
 	ASSERT(tx->tx_pool != NULL);
 	return (tx->tx_pool);
 }
 
 
 /*
  * Append a callback (func, data) to the transaction's callback list.
  * The record is allocated with KM_SLEEP and freed by
  * dmu_tx_do_callbacks() after the callback has run.
  */
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
 	dmu_tx_callback_t *cb =
 	    kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
 
 	cb->dcb_data = data;
 	cb->dcb_func = func;
 	list_insert_tail(&tx->tx_callbacks, cb);
 }
 
 /*
  * Call all the commit callbacks on a list, with a given error code.
  */
 void
 dmu_tx_do_callbacks(list_t *cb_list, int error)
 {
 	dmu_tx_callback_t *cb;
 
 	/*
 	 * Pop from the head until the list is empty.  Each entry is
 	 * unlinked before its function is invoked and freed afterwards.
 	 */
 	for (cb = list_head(cb_list); cb != NULL; cb = list_head(cb_list)) {
 		list_remove(cb_list, cb);
 		cb->dcb_func(cb->dcb_data, error);
 		kmem_free(cb, sizeof (dmu_tx_callback_t));
 	}
 }
 
 /*
  * Interface to hold a bunch of attributes.
  * used for creating new files.
  * attrsize is the total size of all attributes
  * to be added during object creation
  *
  * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
  */
 
 /*
  * hold necessary attribute name for attribute registration.
  * should be a very rare case where this is needed.  If it does
  * happen it would only happen on the first write to the file system.
  */
 static void
 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
 {
 	int idx;
 
 	if (!sa->sa_need_attr_registration)
 		return;
 
 	for (idx = 0; idx != sa->sa_num_attrs; idx++) {
 		uint64_t zapobj;
 
 		if (sa->sa_attr_table[idx].sa_registered)
 			continue;
 
 		/*
 		 * Hold the attribute's name in the registration object if
 		 * there is one, otherwise in a yet-to-be-created object.
 		 */
 		if (sa->sa_reg_attr_obj)
 			zapobj = sa->sa_reg_attr_obj;
 		else
 			zapobj = DMU_NEW_OBJECT;
 
 		dmu_tx_hold_zap(tx, zapobj, B_TRUE,
 		    sa->sa_attr_table[idx].sa_name);
 	}
 }
 
 
 /*
  * Add a spill-block hold for 'object' to the transaction and charge one
  * SPA_OLD_MAXBLOCKSIZE block to the appropriate space estimates.
  *
  * If the dnode has no spill block pointer yet, the space is charged to
  * towrite.  Otherwise it is charged to tooverwrite or towrite depending
  * on whether the existing block is freeable, and additionally to tounref
  * when the existing block pointer is not a hole.
  */
 void
 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_tx_hold_t *txh;
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 	    THT_SPILL, 0, 0);
 
 	/*
 	 * Do not dereference a missing hold record.  The helper is defined
 	 * outside this function; if it could not create the hold there is
 	 * nothing to account for here.
 	 */
 	if (txh == NULL)
 		return;
 
 	dn = txh->txh_dnode;
 
 	/* A hold without a dnode has no existing spill block to examine. */
 	if (dn == NULL)
 		return;
 
 	/* If blkptr doesn't exist then add space to towrite */
 	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 	} else {
 		blkptr_t *bp;
 
 		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
 			txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
 		else
 			txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
 	}
 }
 
 /*
  * Take the holds needed to create a new object carrying system
  * attributes whose total size is attrsize.
  */
 void
 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
 {
 	sa_os_t *sa = tx->tx_objset->os_sa;
 
 	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 
 	/* Without an SA master object only the bonus hold is taken. */
 	if (sa->sa_master_obj == 0)
 		return;
 
 	if (sa->sa_layout_attr_obj == 0) {
 		/*
 		 * No layout object yet: hold the master object's layout and
 		 * registry entries plus two new ZAP objects.
 		 */
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	} else {
 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 	}
 
 	dmu_tx_sa_registration_hold(sa, tx);
 
 	/*
 	 * A spill hold is added only when the attributes do not fit in the
 	 * bonus buffer or spilling is forced.
 	 */
 	if (attrsize > DN_MAX_BONUSLEN || sa->sa_force_spill) {
 		(void) dmu_tx_hold_object_impl(tx, tx->tx_objset,
 		    DMU_NEW_OBJECT, THT_SPILL, 0, 0);
 	}
 }
 
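 /*
  * Hold SA attribute
  *
  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
  *
  * When may_grow is set, the layout attribute object (if one exists) and
  * the object's spill block are held as well.
  *
  * The following describes a variable_size argument that the current
  * signature does not take:
  * variable_size is the total size of all variable sized attributes
  * passed to this function.  It is not the total size of all
  * variable size attributes that *may* exist on this object.
  */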
 void
 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 {
 	sa_os_t *sa = tx->tx_objset->os_sa;
 	uint64_t object;
 
 	ASSERT(hdl != NULL);
 
 	object = sa_handle_object(hdl);
 
 	dmu_tx_hold_bonus(tx, object);
 
 	/* Without an SA master object only the bonus hold is taken. */
 	if (sa->sa_master_obj == 0)
 		return;
 
 	/*
 	 * If either the registration or the layout object is missing, hold
 	 * the master object's entries plus two new ZAP objects.
 	 */
 	if (sa->sa_reg_attr_obj == 0 || sa->sa_layout_attr_obj == 0) {
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	}
 
 	dmu_tx_sa_registration_hold(sa, tx);
 
 	if (may_grow && sa->sa_layout_attr_obj)
 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 
 	if (!(sa->sa_force_spill || may_grow || hdl->sa_spill)) {
 		/*
 		 * Nothing asks for a spill hold up front; take one only if
 		 * the dnode behind the bonus buffer already has a spill.
 		 */
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		if (dn->dn_have_spill) {
 			ASSERT(tx->tx_txg == 0);
 			dmu_tx_hold_spill(tx, object);
 		}
 		DB_DNODE_EXIT(db);
 	} else {
 		ASSERT(tx->tx_txg == 0);
 		dmu_tx_hold_spill(tx, object);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	(revision 275782)
@@ -1,1968 +1,1968 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/range_tree.h>
 
 /* kmem cache for dnode_t; created in dnode_init(), torn down in dnode_fini(). */
 static kmem_cache_t *dnode_cache;
 /*
  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  * turned on when DEBUG is also defined.
  */
 #ifdef	DEBUG
 #define	DNODE_STATS
 #endif	/* DEBUG */
 
 /* Increments a statistic when DNODE_STATS is defined; otherwise a no-op. */
 #ifdef	DNODE_STATS
 #define	DNODE_STAT_ADD(stat)			((stat)++)
 #else
 #define	DNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* DNODE_STATS */
 
 /* File-scope static, hence zero-initialized. */
 static dnode_phys_t dnode_phys_zero;
 
 /* NOTE(review): presumably default data/indirect block shifts; confirm at use. */
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
 #ifdef sun
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif
 
 /*
  * Ordering function for the per-dnode dbuf AVL tree.  Keys are compared
  * in order: level, block id, state, and finally the dbuf's address.
  */
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
 	const dmu_buf_impl_t *d1 = x1;
 	const dmu_buf_impl_t *d2 = x2;
 
 	if (d1->db_level != d2->db_level)
 		return (d1->db_level < d2->db_level ? -1 : 1);
 
 	if (d1->db_blkid != d2->db_blkid)
 		return (d1->db_blkid < d2->db_blkid ? -1 : 1);
 
 	if (d1->db_state != d2->db_state)
 		return (d1->db_state < d2->db_state ? -1 : 1);
 
 	/* Equal states at this point must not be the search marker. */
 	ASSERT3S(d1->db_state, !=, DB_SEARCH);
 	ASSERT3S(d2->db_state, !=, DB_SEARCH);
 
 	/* Last resort: order by address so distinct dbufs never tie. */
 	if ((uintptr_t)d1 != (uintptr_t)d2)
 		return ((uintptr_t)d1 < (uintptr_t)d2 ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * kmem cache constructor for dnode_t (registered in dnode_init()).
  * Initializes the locks, condition variable, refcounts, per-txg lists and
  * the dbuf AVL tree, and puts every other field into the zero/NULL state
  * that dnode_dest() later asserts.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
 	dnode_t *dn = arg;
 	int i;
 
 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
 	/*
 	 * Every dbuf has a reference, and dropping a tracked reference is
 	 * O(number of references), so don't track dn_holds.
 	 */
 	refcount_create_untracked(&dn->dn_holds);
 	refcount_create(&dn->dn_tx_holds);
 	list_link_init(&dn->dn_link);
 
 	/* Clear all per-txg "next value" arrays. */
 	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
 	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
 	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
 	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
 	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
 	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
 	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_link_init(&dn->dn_dirty_link[i]);
 		dn->dn_free_ranges[i] = NULL;
 		list_create(&dn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 	dn->dn_dirtyctx = 0;
 	dn->dn_dirtyctx_firstset = NULL;
 	dn->dn_bonus = NULL;
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_zio = NULL;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_id_flags = 0;
 
 	dn->dn_dbufs_count = 0;
 	dn->dn_unlisted_l0_blkid = 0;
 	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	dn->dn_moved = 0;
 	/* dn_objset stays invalid until dnode_create() publishes the dnode. */
 	POINTER_INVALIDATE(&dn->dn_objset);
 	return (0);
 }
 
 /*
  * kmem cache destructor for dnode_t (registered in dnode_init()).
  * Destroys what dnode_cons() created and asserts that every other field
  * is back in its constructed (zero/NULL) state.
  */
 /* ARGSUSED */
 static void
 dnode_dest(void *arg, void *unused)
 {
 	int i;
 	dnode_t *dn = arg;
 
 	rw_destroy(&dn->dn_struct_rwlock);
 	mutex_destroy(&dn->dn_mtx);
 	mutex_destroy(&dn->dn_dbufs_mtx);
 	cv_destroy(&dn->dn_notxholds);
 	refcount_destroy(&dn->dn_holds);
 	refcount_destroy(&dn->dn_tx_holds);
 	ASSERT(!list_link_active(&dn->dn_link));
 
 	/* Per-txg state must be idle before the lists are destroyed. */
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 		list_destroy(&dn->dn_dirty_records[i]);
 		ASSERT0(dn->dn_next_nblkptr[i]);
 		ASSERT0(dn->dn_next_nlevels[i]);
 		ASSERT0(dn->dn_next_indblkshift[i]);
 		ASSERT0(dn->dn_next_bonustype[i]);
 		ASSERT0(dn->dn_rm_spillblk[i]);
 		ASSERT0(dn->dn_next_bonuslen[i]);
 		ASSERT0(dn->dn_next_blksz[i]);
 	}
 
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_free_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT0(dn->dn_dirtyctx);
 	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 	ASSERT3P(dn->dn_bonus, ==, NULL);
 	ASSERT(!dn->dn_have_spill);
 	ASSERT3P(dn->dn_zio, ==, NULL);
 	ASSERT0(dn->dn_oldused);
 	ASSERT0(dn->dn_oldflags);
 	ASSERT0(dn->dn_olduid);
 	ASSERT0(dn->dn_oldgid);
 	ASSERT0(dn->dn_newuid);
 	ASSERT0(dn->dn_newgid);
 	ASSERT0(dn->dn_id_flags);
 
 	ASSERT0(dn->dn_dbufs_count);
 	ASSERT0(dn->dn_unlisted_l0_blkid);
 	avl_destroy(&dn->dn_dbufs);
 }
 
 /*
  * Create the dnode_t kmem cache with dnode_cons()/dnode_dest() as its
  * constructor and destructor, and register dnode_move() as its move
  * callback.  Must be called while the cache does not yet exist.
  */
 void
 dnode_init(void)
 {
 	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t",
 	    sizeof (dnode_t),
 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 	/*
 	 * NOTE(review): dnode_move is only declared under #ifdef sun above;
 	 * this presumably relies on kmem_cache_set_move() discarding its
 	 * arguments on other platforms -- confirm.
 	 */
 	kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
 /*
  * Destroy the dnode_t kmem cache and reset the pointer so that
  * dnode_init() may be called again.
  */
 void
 dnode_fini(void)
 {
 	kmem_cache_destroy(dnode_cache);
 	dnode_cache = NULL;
 }
 
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only sanity check of an in-core dnode.  A few cheap assertions
  * always run; the remaining checks run only when ZFS_DEBUG_DNODE_VERIFY
  * is set in zfs_flags.  dn_struct_rwlock is taken as reader for the
  * duration unless the caller already holds it as writer.
  */
 void
 dnode_verify(dnode_t *dn)
 {
 	int drop_struct_lock = FALSE;
 
 	ASSERT(dn->dn_phys);
 	ASSERT(dn->dn_objset);
 	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 		return;
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 	/* Geometry checks apply only to allocated dnodes. */
 	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 		int i;
 		ASSERT3U(dn->dn_indblkshift, >=, 0);
 		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 		if (dn->dn_datablkshift) {
 			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 		}
 		ASSERT3U(dn->dn_nlevels, <=, 30);
 		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 		ASSERT3U(dn->dn_nblkptr, >=, 1);
 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		ASSERT3U(dn->dn_datablksz, ==,
 		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		/* dn_datablkshift is non-zero exactly for power-of-2 sizes. */
 		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 		/* Block pointers beyond the first share space with the bonus. */
 		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 		    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		for (i = 0; i < TXG_SIZE; i++) {
 			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 		}
 	}
 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 	/* dn_phys must point at this object's slot inside its dbuf. */
 	if (dn->dn_dbuf != NULL) {
 		ASSERT3P(dn->dn_phys, ==,
 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 	}
 	if (drop_struct_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 }
 #endif
 
 /*
  * Byte-swap a single on-disk dnode in place.  An unallocated dnode
  * (DMU_OT_NONE) is zeroed instead.  Otherwise the multi-byte header
  * fields, the block pointers, the bonus area (using the byteswap
  * function for its bonus type) and, if present, the spill block pointer
  * are swapped.
  */
 void
 dnode_byteswap(dnode_phys_t *dnp)
 {
 	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 	int i;
 
 	if (dnp->dn_type == DMU_OT_NONE) {
 		bzero(dnp, sizeof (dnode_phys_t));
 		return;
 	}
 
 	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 	dnp->dn_used = BSWAP_64(dnp->dn_used);
 
 	/*
 	 * dn_nblkptr is only one byte, so it's OK to read it in either
 	 * byte order.  We can't read dn_bonuslen.
 	 */
 	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 	/* Swap the block pointer array one 64-bit word at a time. */
 	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 		buf64[i] = BSWAP_64(buf64[i]);
 
 	/*
 	 * OK to check dn_bonuslen for zero, because it won't matter if
 	 * we have the wrong byte order.  This is necessary because the
 	 * dnode dnode is smaller than a regular dnode.
 	 */
 	if (dnp->dn_bonuslen != 0) {
 		/*
 		 * Note that the bonus length calculated here may be
 		 * longer than the actual bonus buffer.  This is because
 		 * we always put the bonus buffer after the last block
 		 * pointer (instead of packing it against the end of the
 		 * dnode buffer).
 		 */
 		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 		size_t len = DN_MAX_BONUSLEN - off;
 		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(dnp->dn_bonustype);
 		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 	}
 
 	/* Swap SPILL block if we have one */
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
 
 }
 
 /*
  * Byte-swap every dnode in a buffer of 'size' bytes.  The size must be a
  * whole number of dnode_phys_t structures.
  */
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
 	dnode_phys_t *dnodes = vbuf;
 	int i;
 
 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
 	/* From here on, size counts dnodes rather than bytes. */
 	size >>= DNODE_SHIFT;
 	for (i = 0; i < size; i++)
 		dnode_byteswap(&dnodes[i]);
 }
 
 /*
  * Set the dnode's bonus length to newsize and record the change for the
  * transaction's txg.  A new size of zero is recorded as DN_ZERO_BONUSLEN.
  */
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 
 	dnode_setdirty(dn, tx);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 	dn->dn_bonuslen = newsize;
 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] =
 	    (newsize == 0) ? DN_ZERO_BONUSLEN : dn->dn_bonuslen;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Set the dnode's bonus type to newtype and record the change for the
  * transaction's txg.
  */
 void
 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 {
 	const int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 
 	dnode_setdirty(dn, tx);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dn->dn_bonustype = newtype;
 	dn->dn_next_bonustype[txgoff] = dn->dn_bonustype;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Mark the dnode's spill block for removal in the transaction's txg and
  * clear dn_have_spill.  The caller must hold dn_struct_rwlock as writer.
  */
 void
 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	dnode_setdirty(dn, tx);
 	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
 	dn->dn_have_spill = B_FALSE;
 }
 
 /*
  * Set the in-core data block size fields from a size in bytes.  The size
  * must be a multiple of SPA_MINBLOCKSIZE within the supported range.
  * dn_datablkshift is set only for power-of-two sizes and is 0 otherwise.
  */
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 
 	dn->dn_datablksz = size;
 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 
 	if (ISP2(size))
 		dn->dn_datablkshift = highbit64(size - 1);
 	else
 		dn->dn_datablkshift = 0;
 }
 
 /*
  * Allocate an in-core dnode from dnode_cache and initialize it from the
  * physical dnode at dnp.
  *
  * os     - objset whose os_dnodes list the new dnode is inserted into.
  * dnp    - physical dnode the in-core fields are copied from.
  * db     - stored in dn_dbuf; dnode_special_open() passes NULL here.
  * object - object number stored in dn_object.
  * dnh    - handle stored in dn_handle.
  *
  * Returns the new dnode.  sizeof (dnode_t) is charged to ARC_SPACE_OTHER.
  */
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
 	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 
 	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 	dn->dn_moved = 0;
 
 	/*
 	 * Defer setting dn_objset until the dnode is ready to be a candidate
 	 * for the dnode_move() callback.
 	 */
 	dn->dn_object = object;
 	dn->dn_dbuf = db;
 	dn->dn_handle = dnh;
 	dn->dn_phys = dnp;
 
 	/* A zero on-disk block size leaves all three size fields zeroed. */
 	if (dnp->dn_datablkszsec) {
 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 	} else {
 		dn->dn_datablksz = 0;
 		dn->dn_datablkszsec = 0;
 		dn->dn_datablkshift = 0;
 	}
 	/* Mirror the remaining physical fields into the in-core dnode. */
 	dn->dn_indblkshift = dnp->dn_indblkshift;
 	dn->dn_nlevels = dnp->dn_nlevels;
 	dn->dn_type = dnp->dn_type;
 	dn->dn_nblkptr = dnp->dn_nblkptr;
 	dn->dn_checksum = dnp->dn_checksum;
 	dn->dn_compress = dnp->dn_compress;
 	dn->dn_bonustype = dnp->dn_bonustype;
 	dn->dn_bonuslen = dnp->dn_bonuslen;
 	dn->dn_maxblkid = dnp->dn_maxblkid;
 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 	dn->dn_id_flags = 0;
 
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	mutex_enter(&os->os_lock);
 	list_insert_head(&os->os_dnodes, dn);
 	membar_producer();
 	/*
 	 * Everything else must be valid before assigning dn_objset makes the
 	 * dnode eligible for dnode_move().
 	 */
 	dn->dn_objset = os;
 	mutex_exit(&os->os_lock);
 
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 	return (dn);
 }
 
 /*
  * Tear down an in-core dnode: invalidate its objset pointer, remove it from
  * the objset's os_dnodes list, reset its per-object state, and return it to
  * dnode_cache (giving back the ARC_SPACE_OTHER charge).
  *
  * Caller must be holding the dnode handle, which is released upon return.
  */
 static void
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
 
 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	/* Invalidate dn_objset and unlink from the objset under os_lock. */
 	mutex_enter(&os->os_lock);
 	POINTER_INVALIDATE(&dn->dn_objset);
 	list_remove(&os->os_dnodes, dn);
 	mutex_exit(&os->os_lock);
 
 	/* the dnode can no longer move, so we can release the handle */
 	zrl_remove(&dn->dn_handle->dnh_zrlock);
 
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 
 	dn->dn_dirtyctx = 0;
 	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
 	if (dn->dn_bonus != NULL) {
 		/*
 		 * NOTE(review): db_mtx is entered here and never exited in
 		 * this function; presumably dbuf_evict() consumes the lock -
 		 * confirm against dbuf_evict().
 		 */
 		mutex_enter(&dn->dn_bonus->db_mtx);
 		dbuf_evict(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
 	dn->dn_zio = NULL;
 
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_id_flags = 0;
 	dn->dn_unlisted_l0_blkid = 0;
 
 	dmu_zfetch_rele(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 }
 
 /*
  * Turn a free dnode into a newly allocated object of type ot.
  *
  * blocksize - data block size in bytes; 0 selects 1 << zfs_default_bs,
  *             any other value is rounded up to SPA_MINBLOCKSIZE.
  * ibs       - indirect block shift; 0 selects zfs_default_ibs.  The value
  *             is clamped to [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
  * bonustype, bonuslen - bonus buffer type and length.
  *
  * The dnode is dirtied in the transaction group of tx and the new indirect
  * shift, bonus length, bonus type and block size are recorded for that txg.
  */
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int txgoff;
 	int t;
 
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize != 0)
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 	else
 		blocksize = 1 << zfs_default_bs;
 
 	if (ibs == 0)
 		ibs = zfs_default_ibs;
 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
 	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
 	    dn->dn_object, tx->tx_txg, blocksize, ibs);
 
 	/* The dnode must currently be free, both in core and on disk. */
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 
 	/* The requested object and bonus types must be sane. */
 	ASSERT(ot != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
 	/* No leftover state from a previous life of this dnode. */
 	ASSERT0(dn->dn_maxblkid);
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
 	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	for (t = 0; t < TXG_SIZE; t++) {
 		ASSERT0(dn->dn_next_nblkptr[t]);
 		ASSERT0(dn->dn_next_nlevels[t]);
 		ASSERT0(dn->dn_next_indblkshift[t]);
 		ASSERT0(dn->dn_next_bonuslen[t]);
 		ASSERT0(dn->dn_next_bonustype[t]);
 		ASSERT0(dn->dn_rm_spillblk[t]);
 		ASSERT0(dn->dn_next_blksz[t]);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[t]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[t]), ==, NULL);
 		ASSERT3P(dn->dn_free_ranges[t], ==, NULL);
 	}
 
 	dn->dn_type = ot;
 	dnode_setdblksz(dn, blocksize);
 	dn->dn_indblkshift = ibs;
 	dn->dn_nlevels = 1;
 	/* Maximize bonus space for SA; otherwise use what the bonus leaves. */
 	dn->dn_nblkptr = (bonustype == DMU_OT_SA) ? 1 :
 	    1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	dn->dn_dirtyctx = 0;
 
 	dn->dn_free_txg = 0;
 	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	dn->dn_id_flags = 0;
 
 	dnode_setdirty(dn, tx);
 
 	txgoff = tx->tx_txg & TXG_MASK;
 	dn->dn_next_indblkshift[txgoff] = ibs;
 	dn->dn_next_bonuslen[txgoff] = dn->dn_bonuslen;
 	dn->dn_next_bonustype[txgoff] = dn->dn_bonustype;
 	dn->dn_next_blksz[txgoff] = dn->dn_datablksz;
 }
 
 /*
  * Re-purpose an existing dnode as an object of type ot with the given block
  * size and bonus type/length.  Changed properties are scheduled in the
  * per-txg slots for the transaction group of tx, any spill block is removed,
  * and checksum/compression are reset to "inherit".
  */
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int nblkptr;
 	int txgoff;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
 	txgoff = tx->tx_txg&TXG_MASK;
 
 	/* Maximize bonus space for SA; otherwise use what the bonus leaves. */
 	nblkptr = (bonustype == DMU_OT_SA) ? 1 :
 	    1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 
 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);
 
 	dn->dn_id_flags = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdirty(dn, tx);
 	if (dn->dn_datablksz != blocksize) {
 		/* The block size may only change while block 0 is unused. */
 		ASSERT(dn->dn_maxblkid == 0 &&
 		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 		    dnode_block_freed(dn, 0)));
 		dnode_setdblksz(dn, blocksize);
 		dn->dn_next_blksz[txgoff] = blocksize;
 	}
 	if (bonuslen != dn->dn_bonuslen)
 		dn->dn_next_bonuslen[txgoff] = bonuslen;
 	if (bonustype != dn->dn_bonustype)
 		dn->dn_next_bonustype[txgoff] = bonustype;
 	if (nblkptr != dn->dn_nblkptr)
 		dn->dn_next_nblkptr[txgoff] = nblkptr;
 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		dbuf_rm_spill(dn, tx);
 		dnode_rm_spill(dn, tx);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* The object type is switched outside of either lock. */
 	dn->dn_type = ot;
 
 	/* Bonus size/type and block pointer count change under dn_mtx. */
 	mutex_enter(&dn->dn_mtx);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_nblkptr = nblkptr;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 
 	/* Keep an existing bonus dbuf's size in step with dn_nblkptr. */
 	if (dn->dn_bonus != NULL) {
 		dn->dn_bonus->db.db_size =
 		    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 }
 
 #ifdef	DNODE_STATS
 /*
  * Counters bumped by dnode_move() through DNODE_STAT_ADD(), one for each
  * reason the callback declines to move a dnode.
  */
 static struct {
 	uint64_t dms_dnode_invalid;	/* objset pointer was not valid */
 	uint64_t dms_dnode_recheck1;	/* objset changed after taking os_lock */
 	uint64_t dms_dnode_recheck2;	/* objset changed after os->os_lock */
 	uint64_t dms_dnode_special;	/* DMU_OBJECT_IS_SPECIAL() object */
 	uint64_t dms_dnode_handle;	/* zrl_tryenter() on the handle failed */
 	uint64_t dms_dnode_rwlock;	/* rw_tryenter(dn_struct_rwlock) failed */
 	uint64_t dms_dnode_active;	/* more holds than dbufs */
 } dnode_move_stats;
 #endif	/* DNODE_STATS */
 
 /*
  * Transfer the contents of odn into ndn and leave odn invalidated.
  *
  * Copies the scalar fields and per-txg arrays, moves the dirty-record lists,
  * the dbuf tree, the holds and the zfetch streams over to ndn, repoints the
  * handle (and the zfetch back pointer) at ndn, then resets odn so that it is
  * recognized as invalid and satisfies the destructor.  Asserts on entry that
  * none of odn's own locks are held.
  */
 static void
 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 {
 	int i;
 
 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
 
 	/* Copy fields. */
 	ndn->dn_objset = odn->dn_objset;
 	ndn->dn_object = odn->dn_object;
 	ndn->dn_dbuf = odn->dn_dbuf;
 	ndn->dn_handle = odn->dn_handle;
 	ndn->dn_phys = odn->dn_phys;
 	ndn->dn_type = odn->dn_type;
 	ndn->dn_bonuslen = odn->dn_bonuslen;
 	ndn->dn_bonustype = odn->dn_bonustype;
 	ndn->dn_nblkptr = odn->dn_nblkptr;
 	ndn->dn_checksum = odn->dn_checksum;
 	ndn->dn_compress = odn->dn_compress;
 	ndn->dn_nlevels = odn->dn_nlevels;
 	ndn->dn_indblkshift = odn->dn_indblkshift;
 	ndn->dn_datablkshift = odn->dn_datablkshift;
 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
 	ndn->dn_datablksz = odn->dn_datablksz;
 	ndn->dn_maxblkid = odn->dn_maxblkid;
 	/* Per-txg arrays are copied wholesale. */
 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 	    sizeof (odn->dn_next_nblkptr));
 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
 	    sizeof (odn->dn_next_nlevels));
 	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
 	    sizeof (odn->dn_next_indblkshift));
 	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
 	    sizeof (odn->dn_next_bonustype));
 	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 	    sizeof (odn->dn_rm_spillblk));
 	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 	    sizeof (odn->dn_next_bonuslen));
 	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 	    sizeof (odn->dn_next_blksz));
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_move_tail(&ndn->dn_dirty_records[i],
 		    &odn->dn_dirty_records[i]);
 	}
 	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
 	    sizeof (odn->dn_free_ranges));
 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
 	ndn->dn_free_txg = odn->dn_free_txg;
 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
 	ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
 	ndn->dn_bonus = odn->dn_bonus;
 	ndn->dn_have_spill = odn->dn_have_spill;
 	ndn->dn_zio = odn->dn_zio;
 	ndn->dn_oldused = odn->dn_oldused;
 	ndn->dn_oldflags = odn->dn_oldflags;
 	ndn->dn_olduid = odn->dn_olduid;
 	ndn->dn_oldgid = odn->dn_oldgid;
 	ndn->dn_newuid = odn->dn_newuid;
 	ndn->dn_newgid = odn->dn_newgid;
 	ndn->dn_id_flags = odn->dn_id_flags;
 	/* Re-initialize ndn's zfetch state, then take over odn's streams. */
 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
 	ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
 	ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
 	/*
 	 * Update back pointers. Updating the handle fixes the back pointer of
 	 * every descendant dbuf as well as the bonus dbuf.
 	 */
 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
 	ndn->dn_handle->dnh_dnode = ndn;
 	if (ndn->dn_zfetch.zf_dnode == odn) {
 		ndn->dn_zfetch.zf_dnode = ndn;
 	}
 
 	/*
 	 * Invalidate the original dnode by clearing all of its back pointers.
 	 */
 	odn->dn_dbuf = NULL;
 	odn->dn_handle = NULL;
 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 	odn->dn_dbufs_count = 0;
 	odn->dn_unlisted_l0_blkid = 0;
 	odn->dn_bonus = NULL;
 	odn->dn_zfetch.zf_dnode = NULL;
 
 	/*
 	 * Set the low bit of the objset pointer to ensure that dnode_move()
 	 * recognizes the dnode as invalid in any subsequent callback.
 	 */
 	POINTER_INVALIDATE(&odn->dn_objset);
 
 	/*
 	 * Satisfy the destructor.
 	 *
 	 * NOTE(review): dn_next_nblkptr[] is copied to ndn above but is not
 	 * cleared here like the other dn_next_* arrays - confirm whether the
 	 * destructor checks it.
 	 */
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&odn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 		odn->dn_free_ranges[i] = NULL;
 		odn->dn_next_nlevels[i] = 0;
 		odn->dn_next_indblkshift[i] = 0;
 		odn->dn_next_bonustype[i] = 0;
 		odn->dn_rm_spillblk[i] = 0;
 		odn->dn_next_bonuslen[i] = 0;
 		odn->dn_next_blksz[i] = 0;
 	}
 	odn->dn_allocated_txg = 0;
 	odn->dn_free_txg = 0;
 	odn->dn_assigned_txg = 0;
 	odn->dn_dirtyctx = 0;
 	odn->dn_dirtyctx_firstset = NULL;
 	odn->dn_have_spill = B_FALSE;
 	odn->dn_zio = NULL;
 	odn->dn_oldused = 0;
 	odn->dn_oldflags = 0;
 	odn->dn_olduid = 0;
 	odn->dn_oldgid = 0;
 	odn->dn_newuid = 0;
 	odn->dn_newgid = 0;
 	odn->dn_id_flags = 0;
 
 	/*
 	 * Mark the dnode.
 	 */
 	ndn->dn_moved = 1;
 	odn->dn_moved = (uint8_t)-1;
 }
 
 #ifdef sun
 #ifdef	_KERNEL
 /*
  * Move callback for dnodes: try to relocate the dnode at buf into newbuf.
  * size and arg are unused.
  *
  * Returns:
  * KMEM_CBRC_DONT_KNOW - the objset pointer is invalid, or changed while the
  *                       locks were being taken.
  * KMEM_CBRC_NO        - the dnode is a "special" object.
  * KMEM_CBRC_LATER     - the handle lock or dn_struct_rwlock could not be
  *                       taken without blocking, or the dnode has more holds
  *                       than dbufs.
  * KMEM_CBRC_YES       - the dnode was moved by dnode_move_impl().
  */
 /*ARGSUSED*/
 static kmem_cbrc_t
 dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 {
 	dnode_t *odn = buf, *ndn = newbuf;
 	objset_t *os;
 	int64_t refcount;
 	uint32_t dbufs;
 
 	/*
 	 * The dnode is on the objset's list of known dnodes if the objset
 	 * pointer is valid. We set the low bit of the objset pointer when
 	 * freeing the dnode to invalidate it, and the memory patterns written
 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 	 * A newly created dnode sets the objset pointer last of all to indicate
 	 * that the dnode is known and in a valid state to be moved by this
 	 * function.
 	 */
 	os = odn->dn_objset;
 	if (!POINTER_IS_VALID(os)) {
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * Ensure that the objset does not go away during the move.
 	 */
 	rw_enter(&os_lock, RW_WRITER);
 	if (os != odn->dn_objset) {
 		rw_exit(&os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * If the dnode is still valid, then so is the objset. We know that no
 	 * valid objset can be freed while we hold os_lock, so we can safely
 	 * ensure that the objset remains in use.
 	 */
 	mutex_enter(&os->os_lock);
 
 	/*
 	 * Recheck the objset pointer in case the dnode was removed just before
 	 * acquiring the lock.
 	 */
 	if (os != odn->dn_objset) {
 		mutex_exit(&os->os_lock);
 		rw_exit(&os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * At this point we know that as long as we hold os->os_lock, the dnode
 	 * cannot be freed and fields within the dnode can be safely accessed.
 	 * The objset listing this dnode cannot go away as long as this dnode is
 	 * on its list.
 	 */
 	rw_exit(&os_lock);
 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
 		return (KMEM_CBRC_NO);
 	}
 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 
 	/*
 	 * Lock the dnode handle to prevent the dnode from obtaining any new
 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
 	 * from accessing the dnode, so that we can discount their holds. The
 	 * handle is safe to access because we know that while the dnode cannot
 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
 	 * safely move any dnode referenced only by dbufs.
 	 */
 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 	 * We need to guarantee that there is a hold for every dbuf in order to
 	 * determine whether the dnode is actively referenced. Falsely matching
 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
 	 * that a thread already having an active dnode hold is about to add a
 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
 	 * progress.
 	 */
 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
 	 * case, the dbuf count is decremented under the handle lock before the
 	 * dbuf's hold is released. This order ensures that if we count the hold
 	 * after the dbuf is removed but before its hold is released, we will
 	 * treat the unmatched hold as active and exit safely. If we count the
 	 * hold before the dbuf is removed, the hold is discounted, and the
 	 * removal is blocked until the move completes.
 	 */
 	refcount = refcount_count(&odn->dn_holds);
 	ASSERT(refcount >= 0);
 	dbufs = odn->dn_dbufs_count;
 
 	/* We can't have more dbufs than dnode holds. */
 	ASSERT3U(dbufs, <=, refcount);
 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 	    uint32_t, dbufs);
 
 	/* Any hold not accounted for by a dbuf is treated as active. */
 	if (refcount > dbufs) {
 		rw_exit(&odn->dn_struct_rwlock);
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/* dnode_move_impl() asserts that dn_struct_rwlock is not held. */
 	rw_exit(&odn->dn_struct_rwlock);
 
 	/*
 	 * At this point we know that anyone with a hold on the dnode is not
 	 * actively referencing it. The dnode is known and in a valid state to
 	 * move. We're holding the locks needed to execute the critical section.
 	 */
 	dnode_move_impl(odn, ndn);
 
 	list_link_replace(&odn->dn_link, &ndn->dn_link);
 	/* If the dnode was safe to move, the refcount cannot have changed. */
 	ASSERT(refcount == refcount_count(&ndn->dn_holds));
 	ASSERT(dbufs == ndn->dn_dbufs_count);
 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 	mutex_exit(&os->os_lock);
 
 	return (KMEM_CBRC_YES);
 }
 #endif	/* _KERNEL */
 #endif	/* sun */
 
 /*
  * Destroy the dnode attached to a "special" dnode handle, along with the
  * handle's lock, and clear the handle's dnode pointer.
  */
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
 	dnode_t *victim = dnh->dnh_dnode;
 
 	/*
 	 * Wait for the final references to the dnode to clear.  This can
 	 * only happen if the arc is asynchronously evicting state that
 	 * has a hold on this dnode while we are trying to evict this
 	 * dnode.
 	 */
 	for (;;) {
 		if (refcount_count(&victim->dn_holds) <= 0)
 			break;
 		delay(1);
 	}
 
 	zrl_add(&dnh->dnh_zrlock);
 	/* dnode_destroy() performs the matching zrl_remove(). */
 	dnode_destroy(victim);
 	zrl_destroy(&dnh->dnh_zrlock);
 	dnh->dnh_dnode = NULL;
 }
 
 /*
  * Create the in-core dnode for a "special" object that has no parent dbuf,
  * attach it to dnh, initialize the handle's lock, and return the dnode.
  */
 dnode_t *
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
 	dnode_t *special;
 
 	/* No containing dbuf is passed for a special dnode. */
 	special = dnode_create(os, dnp, NULL, object, dnh);
 	dnh->dnh_dnode = special;
 	zrl_init(&dnh->dnh_zrlock);
 
 	DNODE_VERIFY(special);
 	return (special);
 }
 
 /*
  * Callback registered through dmu_buf_set_user() in dnode_hold_impl().
  * Destroys every instantiated dnode in the dnode_children_t passed as arg,
  * destroys each handle's lock, and frees the dnode_children_t itself.
  */
 static void
 dnode_buf_pageout(dmu_buf_t *db, void *arg)
 {
 	dnode_children_t *dnc = arg;
 	int epb = db->db_size >> DNODE_SHIFT;
 	int slot;
 
 	ASSERT(epb == dnc->dnc_count);
 
 	for (slot = 0; slot < epb; slot++) {
 		dnode_handle_t *dnh = &dnc->dnc_children[slot];
 
 		/*
 		 * The dnode handle lock guards against the dnode moving to
 		 * another valid address, so there is no need here to guard
 		 * against changes to or from NULL.
 		 */
 		if (dnh->dnh_dnode != NULL) {
 			dnode_t *dn;
 
 			zrl_add(&dnh->dnh_zrlock);
 			dn = dnh->dnh_dnode;
 			/*
 			 * Holds on this dnode would imply holds on the
 			 * containing dbuf, which would then not be eligible
 			 * for eviction, and this function would not have
 			 * been called.
 			 */
 			ASSERT(refcount_is_zero(&dn->dn_holds));
 			ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
 			/* dnode_destroy() does the matching zrl_remove(). */
 			dnode_destroy(dn);
 			zrl_destroy(&dnh->dnh_zrlock);
 			dnh->dnh_dnode = NULL;
 		} else {
 			/* Never instantiated: only the lock needs cleanup. */
 			zrl_destroy(&dnh->dnh_zrlock);
 		}
 	}
 
 	kmem_free(dnc, sizeof (dnode_children_t) +
 	    epb * sizeof (dnode_handle_t));
 }
 
 /*
  * Look up object in os and, on success, store a held dnode in *dnp.
  *
  * flag may contain DNODE_MUST_BE_ALLOCATED and/or DNODE_MUST_BE_FREE; tag
  * identifies the hold that is added to dn_holds.
  *
  * errors:
  * EINVAL - invalid object number.
  * EIO - i/o error.
  * ENOENT - the user/group-used dnode does not exist, or the dnode is of
  *          type DMU_OT_NONE and the request could not be satisfied.
  * EEXIST - the dnode is of a type other than DMU_OT_NONE and the request
  *          could not be satisfied.
  * Errors from dbuf_read() are passed through unchanged.
  * succeeds even for free dnodes (unless DNODE_MUST_BE_ALLOCATED is set).
  */
 int
 dnode_hold_impl(objset_t *os, uint64_t object, int flag,
     void *tag, dnode_t **dnp)
 {
 	int epb, idx, err;
 	int drop_struct_lock = FALSE;
 	int type;
 	uint64_t blk;
 	dnode_t *mdn, *dn;
 	dmu_buf_impl_t *db;
 	dnode_children_t *children_dnodes;
 	dnode_handle_t *dnh;
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
 	 * be asking the DMU to do *anything* unless it's the root pool
 	 * which may require us to read from the root filesystem while
 	 * holding some (not all) of the locks as writer.
 	 */
 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
 	    (spa_is_root(os->os_spa) &&
 	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
 	/* The user/group-used objects are taken straight from the objset. */
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
 		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
 		if (dn == NULL)
 			return (SET_ERROR(ENOENT));
 		type = dn->dn_type;
 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
 			return (SET_ERROR(ENOENT));
 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
 			return (SET_ERROR(EEXIST));
 		DNODE_VERIFY(dn);
 		(void) refcount_add(&dn->dn_holds, tag);
 		*dnp = dn;
 		return (0);
 	}
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return (SET_ERROR(EINVAL));
 
 	mdn = DMU_META_DNODE(os);
 	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
 	DNODE_VERIFY(mdn);
 
 	/* Take the meta-dnode's struct lock as reader unless held as writer. */
 	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	/* Find and hold the meta-dnode block that contains this object. */
 	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
 		rw_exit(&mdn->dn_struct_rwlock);
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 	if (err) {
 		dbuf_rele(db, FTAG);
 		return (err);
 	}
 
 	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
 	/* epb: number of dnodes in this block; idx: our slot within it. */
 	epb = db->db.db_size >> DNODE_SHIFT;
 
 	idx = object & (epb-1);
 
 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	children_dnodes = dmu_buf_get_user(&db->db);
 	if (children_dnodes == NULL) {
 		int i;
 		dnode_children_t *winner;
 		/* First use of this block: build its array of handles. */
 		children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
 		    epb * sizeof (dnode_handle_t), KM_SLEEP);
 		children_dnodes->dnc_count = epb;
 		dnh = &children_dnodes->dnc_children[0];
 		for (i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
 			dnh[i].dnh_dnode = NULL;
 		}
 		/*
 		 * A non-NULL return means user data was already attached to
 		 * the dbuf; discard our array and use the existing one.
 		 */
-		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+		if (winner = dmu_buf_set_user(&db->db, children_dnodes,
 		    dnode_buf_pageout)) {
 
 			for (i = 0; i < epb; i++) {
 				zrl_destroy(&dnh[i].dnh_zrlock);
 			}
 
 			kmem_free(children_dnodes, sizeof (dnode_children_t) +
 			    epb * sizeof (dnode_handle_t));
 			children_dnodes = winner;
 		}
 	}
 	ASSERT(children_dnodes->dnc_count == epb);
 
 	dnh = &children_dnodes->dnc_children[idx];
 	zrl_add(&dnh->dnh_zrlock);
 	if ((dn = dnh->dnh_dnode) == NULL) {
 		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
 		dnode_t *winner;
 
 		/*
 		 * Instantiate the dnode and publish it with a compare-and-swap;
 		 * if the slot was filled in the meantime, destroy ours and use
 		 * the one already there.
 		 */
 		dn = dnode_create(os, phys, db, object, dnh);
 		winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
 		if (winner != NULL) {
 			zrl_add(&dnh->dnh_zrlock);
 			dnode_destroy(dn); /* implicit zrl_remove() */
 			dn = winner;
 		}
 	}
 
 	mutex_enter(&dn->dn_mtx);
 	type = dn->dn_type;
 	if (dn->dn_free_txg ||
 	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
 	    ((flag & DNODE_MUST_BE_FREE) &&
 	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
 		mutex_exit(&dn->dn_mtx);
 		zrl_remove(&dnh->dnh_zrlock);
 		dbuf_rele(db, FTAG);
 		/*
 		 * NOTE(review): unlike the other error returns in this
 		 * function, this one is not wrapped in SET_ERROR() - confirm
 		 * whether that is intentional.
 		 */
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	/* The first hold on the dnode also takes a hold on its dbuf. */
 	if (refcount_add(&dn->dn_holds, tag) == 1)
 		dbuf_add_ref(db, dnh);
 	/* Now we can rely on the hold to prevent the dnode from moving. */
 	zrl_remove(&dnh->dnh_zrlock);
 
 	DNODE_VERIFY(dn);
 	ASSERT3P(dn->dn_dbuf, ==, db);
 	ASSERT3U(dn->dn_object, ==, object);
 	dbuf_rele(db, FTAG);
 
 	*dnp = dn;
 	return (0);
 }
 
 /*
  * Hold the dnode for an allocated object.
  *
  * Calls dnode_hold_impl() with DNODE_MUST_BE_ALLOCATED.  On success returns
  * 0 and stores the held dnode in *dnp; otherwise returns the error from
  * dnode_hold_impl().
  */
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
 }
 
 /*
  * Add a hold identified by tag to a dnode that already has at least one
  * hold.  Returns TRUE if the hold was added and FALSE if the dnode had no
  * holds, in which case nothing is changed.
  */
 boolean_t
 dnode_add_ref(dnode_t *dn, void *tag)
 {
 	boolean_t added = FALSE;
 
 	mutex_enter(&dn->dn_mtx);
 	if (!refcount_is_zero(&dn->dn_holds)) {
 		/* There was a prior hold, so the new count must exceed one. */
 		VERIFY(1 < refcount_add(&dn->dn_holds, tag));
 		added = TRUE;
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	return (added);
 }
 
 /*
  * Drop the hold identified by tag.  When the last hold goes away and the
  * dnode has a containing dbuf, the dnode's hold on that dbuf is released
  * as well.
  */
 void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	/* Get while the hold prevents the dnode from moving. */
 	dnode_handle_t *dnh = dn->dn_handle;
 	dmu_buf_impl_t *db = dn->dn_dbuf;
 	uint64_t remaining;
 
 	mutex_enter(&dn->dn_mtx);
 	remaining = refcount_remove(&dn->dn_holds, tag);
 	mutex_exit(&dn->dn_mtx);
 
 	/*
 	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
 	 * indirectly by dbuf_rele() while relying on the dnode handle to
 	 * prevent the dnode from moving, since releasing the last hold could
 	 * result in the dnode's parent dbuf evicting its dnode handles. For
 	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
 	 * other direct or indirect hold on the dnode must first drop the dnode
 	 * handle.
 	 */
 	ASSERT(remaining > 0 || dnh->dnh_zrlock.zr_owner != curthread);
 
 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
 	if (remaining != 0 || db == NULL)
 		return;
 
 	/*
 	 * Another thread could add a hold to the dnode handle in
 	 * dnode_hold_impl() while holding the parent dbuf. Since the
 	 * hold on the parent dbuf prevents the handle from being
 	 * destroyed, the hold on the handle is OK. We can't yet assert
 	 * that the handle has zero references, but that will be
 	 * asserted anyway when the handle gets destroyed.
 	 */
 	dbuf_rele(db, dnh);
 }
 
 /*
  * Mark a dnode dirty in the transaction group of tx.
  *
  * For a "special" object only the dataset is dirtied.  Otherwise the dnode
  * is placed on the objset's free or dirty list for the txg (unless it is
  * already on one), a hold tagged with the txg is added, and the containing
  * dbuf and the dataset are dirtied.
  */
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;
 	int txgoff = txg & TXG_MASK;
 	list_t *target;
 
 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		dsl_dataset_dirty(os->os_dsl_dataset, tx);
 		return;
 	}
 
 	DNODE_VERIFY(dn);
 
 #ifdef ZFS_DEBUG
 	mutex_enter(&dn->dn_mtx);
 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
 	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
 	mutex_exit(&dn->dn_mtx);
 #endif
 
 	/*
 	 * Determine old uid/gid when necessary
 	 */
 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
 
 	mutex_enter(&os->os_lock);
 
 	/* Nothing more to do if the dnode is already dirty in this txg. */
 	if (list_link_active(&dn->dn_dirty_link[txgoff])) {
 		mutex_exit(&os->os_lock);
 		return;
 	}
 
 	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
 	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT0(dn->dn_next_bonuslen[txgoff]);
 	ASSERT0(dn->dn_next_blksz[txgoff]);
 	ASSERT0(dn->dn_next_bonustype[txgoff]);
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    dn->dn_object, txg);
 
 	/* A dnode being freed in or before this txg goes on the free list. */
 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg)
 		target = &os->os_free_dnodes[txgoff];
 	else
 		target = &os->os_dirty_dnodes[txgoff];
 	list_insert_tail(target, dn);
 
 	mutex_exit(&os->os_lock);
 
 	/*
 	 * The dnode maintains a hold on its containing dbuf as
 	 * long as there are holds on it.  Each instantiated child
 	 * dbuf maintains a hold on the dnode.  When the last child
 	 * drops its hold, the dnode will drop its hold on the
 	 * containing dbuf. We add a "dirty hold" here so that the
 	 * dnode will hang around after we finish processing its
 	 * children.
 	 */
 	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
 
 	(void) dbuf_dirty(dn->dn_dbuf, tx);
 
 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
 }
 
 /*
  * Mark a dnode as being freed in the transaction group of tx.  Does nothing
  * if the dnode is already free (DMU_OT_NONE) or already being freed.
  */
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
 	int txgoff = tx->tx_txg & TXG_MASK;
 	objset_t *os;
 
 	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
 
 	/* we should be the only holder... hopefully */
 	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
 
 	mutex_enter(&dn->dn_mtx);
 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 	dn->dn_free_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 
 	os = dn->dn_objset;
 	mutex_enter(&os->os_lock);
 	if (!list_link_active(&dn->dn_dirty_link[txgoff])) {
 		/* Not dirty yet: dnode_setdirty() will pick the right list. */
 		mutex_exit(&os->os_lock);
 		dnode_setdirty(dn, tx);
 		return;
 	}
 
 	/*
 	 * The dnode is already dirty, so it needs to be moved from
 	 * the dirty list to the free list.
 	 */
 	list_remove(&os->os_dirty_dnodes[txgoff], dn);
 	list_insert_tail(&os->os_free_dnodes[txgoff], dn);
 	mutex_exit(&os->os_lock);
 }
 
 /*
  * Try to change the data block size (and optionally the indirect block
  * shift) of a dnode.  This can only succeed if there are no blocks
  * allocated or dirty beyond the first block.
  *
  * size - requested block size; 0 selects SPA_MINBLOCKSIZE, anything else is
  *        rounded up to a multiple of SPA_MINBLOCKSIZE.
  * ibs  - requested indirect block shift; a value equal to the current shift
  *        is treated as "no change".
  *
  * Returns 0 on success (including when nothing needs to change) and ENOTSUP
  * when the change cannot be made.
  */
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *scan;
 	dmu_buf_impl_t *db = NULL;
 	int err;
 
 	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	size = (size == 0) ? SPA_MINBLOCKSIZE :
 	    P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;
 
 	if (ibs == 0 && size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec)
 		return (0);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Check for any allocated blocks beyond the first */
 	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	/* Any dbuf other than block 0, bonus or spill rules the change out. */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	scan = avl_first(&dn->dn_dbufs);
 	while (scan != NULL) {
 		if (scan->db_blkid != 0 &&
 		    scan->db_blkid != DMU_BONUS_BLKID &&
 		    scan->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
 		scan = AVL_NEXT(&dn->dn_dbufs, scan);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;
 
 	/* resize the old block; ENOENT from the hold is tolerated */
 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 	else if (err != ENOENT)
 		goto fail;
 
 	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
 	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
 		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
 	}
 	/* rele after we have fixed the blocksize in the dnode */
 	if (db != NULL)
 		dbuf_rele(db, FTAG);
 
 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);
 
 fail:
 	rw_exit(&dn->dn_struct_rwlock);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Note that block blkid of dn is in use: raise dn_maxblkid if needed and,
  * when the new maximum requires more levels of indirection, raise
  * dn_nlevels and re-parent this txg's dirty records under the newly dirtied
  * indirect block.
  *
  * have_read - true if the caller holds dn_struct_rwlock as reader; otherwise
  *             the caller must hold it as writer.  A reader's lock may be
  *             dropped and re-taken as writer here, and is downgraded back to
  *             reader before returning from the slow path.
  *
  * read-holding callers must not rely on the lock being continuously held
  */
 void
 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 	int epbs, new_nlevels;
 	uint64_t sz;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	ASSERT(have_read ?
 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
 	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * if we have a read-lock, check to see if we need to do any work
 	 * before upgrading to a write-lock.
 	 */
 	if (have_read) {
 		if (blkid <= dn->dn_maxblkid)
 			return;
 
 		/* If the upgrade fails, drop the lock and retake as writer. */
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 	}
 
 	/* Re-check under the write lock. */
 	if (blkid <= dn->dn_maxblkid)
 		goto out;
 
 	dn->dn_maxblkid = blkid;
 
 	/*
 	 * Compute the number of levels necessary to support the new maxblkid.
 	 *
 	 * NOTE(review): the "sz >= dn->dn_nblkptr" term looks like a guard
 	 * against sz wrapping around as it is shifted - confirm.
 	 */
 	new_nlevels = 1;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	for (sz = dn->dn_nblkptr;
 	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
 		new_nlevels++;
 
 	if (new_nlevels > dn->dn_nlevels) {
 		int old_nlevels = dn->dn_nlevels;
 		dmu_buf_impl_t *db;
 		list_t *list;
 		dbuf_dirty_record_t *new, *dr, *dr_next;
 
 		dn->dn_nlevels = new_nlevels;
 
 		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
 		dn->dn_next_nlevels[txgoff] = new_nlevels;
 
 		/* dirty the left indirects */
 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
 		ASSERT(db != NULL);
 		new = dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);
 
 		/* transfer the dirty records to the new indirect */
 		mutex_enter(&dn->dn_mtx);
 		mutex_enter(&new->dt.di.dr_mtx);
 		list = &dn->dn_dirty_records[txgoff];
 		/* dr_next is fetched first because dr may leave the list. */
 		for (dr = list_head(list); dr; dr = dr_next) {
 			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
 			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
 			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
 				list_remove(&dn->dn_dirty_records[txgoff], dr);
 				list_insert_tail(&new->dt.di.dr_children, dr);
 				dr->dr_parent = new;
 			}
 		}
 		mutex_exit(&new->dt.di.dr_mtx);
 		mutex_exit(&dn->dn_mtx);
 	}
 
 out:
 	if (have_read)
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
 /*
  * Free the byte range [off, off + len) of the dnode in transaction 'tx'.
  * Passing len == DMU_OBJECT_END frees from 'off' to the end of the object
  * (handled below as a truncation).
  *
  * Partial blocks at the head and tail of the range are zeroed in place.
  * The whole blocks in between are recorded in dn_free_ranges for this txg;
  * the free itself is finished in the syncing phase.  dn_struct_rwlock is
  * taken as writer on entry and released on exit, and is dropped briefly
  * around each dmu_buf_will_dirty() call on a partial block.
  */
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	uint64_t blkoff, blkid, nblks;
 	int blksz, blkshift, head, tail;
 	int trunc = FALSE;
 	int epbs;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	blksz = dn->dn_datablksz;
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	if (len == DMU_OBJECT_END) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
 	}
 
 	/*
 	 * First, block align the region to free:
 	 */
 	if (ISP2(blksz)) {
 		/* head: bytes from off up to the next block boundary */
 		head = P2NPHASE(off, blksz);
 		/* blkoff: offset of 'off' within its block */
 		blkoff = P2PHASE(off, blksz);
 		/* nothing to do if the range starts beyond the last block */
 		if ((off >> blkshift) > dn->dn_maxblkid)
 			goto out;
 	} else {
 		/* a non-power-of-2 block size implies a single-block object */
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
 			/*
 			 * Freeing the whole block; fast-track this request.
 			 * Note that we won't dirty any indirect blocks,
 			 * which is fine because we will be freeing the entire
 			 * file and thus all indirect blocks will be freed
 			 * by free_children().
 			 */
 			blkid = 0;
 			nblks = 1;
 			goto done;
 		} else if (off >= blksz) {
 			/* Freeing past end-of-data */
 			goto out;
 		} else {
 			/* Freeing part of the block. */
 			head = blksz - off;
 			ASSERT3U(head, >, 0);
 		}
 		blkoff = off;
 	}
 	/* zero out any partial block data at the start of the range */
 	if (head) {
 		ASSERT3U(blkoff + head, ==, blksz);
 		/* the range may end before the block boundary */
 		if (len < head)
 			head = len;
 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
 		    FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				/* the struct lock is not held across the dirty call */
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				data = db->db.db_data;
 				bzero(data + blkoff, head);
 			}
 			dbuf_rele(db, FTAG);
 		}
 		off += head;
 		len -= head;
 	}
 
 	/* If the range was less than one block, we're done */
 	if (len == 0)
 		goto out;
 
 	/* If the remaining range is past end of file, we're done */
 	if ((off >> blkshift) > dn->dn_maxblkid)
 		goto out;
 
 	ASSERT(ISP2(blksz));
 	/* a truncation runs to the end of the object, so it has no tail */
 	if (trunc)
 		tail = 0;
 	else
 		tail = P2PHASE(len, blksz);
 
 	ASSERT0(P2PHASE(off, blksz));
 	/* zero out any partial block data at the end of the range */
 	if (tail) {
 		if (len < tail)
 			tail = len;
 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
 		    TRUE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				/* the struct lock is not held across the dirty call */
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				bzero(db->db.db_data, tail);
 			}
 			dbuf_rele(db, FTAG);
 		}
 		len -= tail;
 	}
 
 	/* If the range did not include a full block, we are done */
 	if (len == 0)
 		goto out;
 
 	/* what remains is a run of whole blocks: convert bytes to block ids */
 	ASSERT(IS_P2ALIGNED(off, blksz));
 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
 	blkid = off >> blkshift;
 	nblks = len >> blkshift;
 	if (trunc)
 		nblks += 1;
 
 	/*
 	 * Dirty the first and last indirect blocks, as they (and/or their
 	 * parents) will need to be written out if they were only
 	 * partially freed.  Interior indirect blocks will be themselves freed,
 	 * by free_children(), so they need not be dirtied.  Note that these
 	 * interior blocks have already been prefetched by dmu_tx_hold_free().
 	 */
 	if (dn->dn_nlevels > 1) {
 		uint64_t first, last;
 
 		first = blkid >> epbs;
 		if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
 			dmu_buf_will_dirty(&db->db, tx);
 			dbuf_rele(db, FTAG);
 		}
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
 		/* only needed when the range spans more than one indirect */
 		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
 			dmu_buf_will_dirty(&db->db, tx);
 			dbuf_rele(db, FTAG);
 		}
 	}
 
 done:
 	/*
 	 * Add this range to the dnode range list.
 	 * We will finish up this free operation in the syncing phase.
 	 */
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	/* the per-txg range tree is created on first use */
 	if (dn->dn_free_ranges[txgoff] == NULL) {
 		dn->dn_free_ranges[txgoff] =
 		    range_tree_create(NULL, NULL, &dn->dn_mtx);
 	}
 	/* clear first, so the add below cannot overlap an existing segment */
 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
 	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
 	    blkid, nblks, tx->tx_txg);
 	mutex_exit(&dn->dn_mtx);
 
 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 out:
 
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Report whether any txg slot of dn_rm_spillblk is marked
  * DN_KILL_SPILLBLK.  The slots are scanned while holding dn_mtx.
  */
 static boolean_t
 dnode_spill_freed(dnode_t *dn)
 {
 	int slot = 0;
 
 	mutex_enter(&dn->dn_mtx);
 	while (slot < TXG_SIZE &&
 	    dn->dn_rm_spillblk[slot] != DN_KILL_SPILLBLK)
 		slot++;
 	mutex_exit(&dn->dn_mtx);
 
 	/* the scan stopped early only if a marked slot was found */
 	return (slot < TXG_SIZE);
 }
 
 /*
  * Return TRUE if 'blkid' was freed in a recent txg, FALSE otherwise.
  *
  * The bonus buffer is never reported as freed.  A dnode with dn_free_txg
  * set reports every other block as freed.  The spill block is answered by
  * dnode_spill_freed(); ordinary blocks are looked up in the per-txg
  * dn_free_ranges trees under dn_mtx.
  */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
 	int slot;
 
 	/*
 	 * If we're in the process of opening the pool, dp will not be
 	 * set yet, but there shouldn't be anything dirty.
 	 */
 	if (blkid == DMU_BONUS_BLKID || dp == NULL)
 		return (FALSE);
 
 	if (dn->dn_free_txg)
 		return (TRUE);
 
 	if (blkid == DMU_SPILL_BLKID)
 		return (dnode_spill_freed(dn));
 
 	mutex_enter(&dn->dn_mtx);
 	for (slot = 0; slot < TXG_SIZE; slot++) {
 		/* no range tree means nothing was freed in this slot */
 		if (dn->dn_free_ranges[slot] == NULL)
 			continue;
 		if (range_tree_contains(dn->dn_free_ranges[slot], blkid, 1))
 			break;
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	return (slot < TXG_SIZE);
 }
 
 /*
  * Called from syncing context when space is actually written or freed for
  * this dnode.  Applies the signed byte count 'delta' to the on-disk usage
  * in dn_phys, under dn_mtx.
  */
 void
 dnode_diduse_space(dnode_t *dn, int64_t delta)
 {
 	uint64_t space;
 
 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
 	    dn, dn->dn_phys,
 	    (u_longlong_t)dn->dn_phys->dn_used,
 	    (longlong_t)delta);
 
 	mutex_enter(&dn->dn_mtx);
 
 	space = DN_USED_BYTES(dn->dn_phys);
 	if (delta <= 0) {
 		ASSERT3U(space, >=, -delta); /* no underflow */
 	} else {
 		ASSERT3U(space + delta, >=, space); /* no overflow */
 	}
 	space += delta;
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_DNODE_BYTES) {
 		/* usage is kept in bytes, and flagged as such */
 		dn->dn_phys->dn_used = space;
 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
 	} else {
 		/* older pools keep usage in units of 1 << DEV_BSHIFT bytes */
 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
 	}
 
 	mutex_exit(&dn->dn_mtx);
 }
 
 /*
  * Called from open context when we expect to write or free 'space' bytes,
  * to track the amount of memory in use by the currently open txg.
  *
  * The dsl_dir is charged the allocated size returned by spa_get_asize()
  * and the pool's dirty-space accounting is charged 'space' itself; both
  * are skipped for an objset with no dataset.  The tx is always charged
  * the allocated size.
  */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
 	dsl_dataset_t *dataset = dn->dn_objset->os_dsl_dataset;
 	int64_t asize = spa_get_asize(dn->dn_objset->os_spa, space);
 
 	if (dataset != NULL) {
 		dsl_dir_willuse_space(dataset->ds_dir, asize, tx);
 		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
 	}
 
 	dmu_tx_willuse_space(tx, asize);
 }
 
 /*
  * Scans a block at the indicated "level" looking for a hole or data,
  * depending on 'flags'.
  *
  * If level > 0, then we are scanning an indirect block looking at its
  * pointers.  If level == 0, then we are looking at a block of dnodes.
  *
  * If we don't find what we are looking for in the block, we return ESRCH.
  * Otherwise, return with *offset pointing to the beginning (if searching
  * forwards) or end (if searching backwards) of the range covered by the
  * block pointer we matched on (or dnode).
  *
  * The basic search algorithm used below by dnode_next_offset() is to
  * use this function to search up the block tree (widen the search) until
  * we find something (i.e., we don't return ESRCH) and then search back
  * down the tree (narrow the search) until we reach our original search
  * level.
  *
  * Errors from dbuf_hold_impl() (other than ENOENT) and from dbuf_read()
  * are returned to the caller unchanged.
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	uint64_t epb = 1ULL << epbs;
 	uint64_t minfill, maxfill;
 	boolean_t hole;
 	int i, inc, error, span;
 
 	dprintf("probing object %llu offset %llx level %d of %u\n",
 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
 
 	hole = ((flags & DNODE_FIND_HOLE) != 0);
 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
 	/* a txg filter is only used when searching for data */
 	ASSERT(txg == 0 || !hole);
 
 	if (lvl == dn->dn_phys->dn_nlevels) {
 		/* top of the tree: scan the block pointers in the dnode itself */
 		error = 0;
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
 		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
 			/* ENOENT while looking for a hole counts as a match */
 			if (hole)
 				return (0);
 			/*
 			 * This can only happen when we are searching up
 			 * the block tree for data.  We don't really need to
 			 * adjust the offset, as we will just end up looking
 			 * at the pointer to this block in its parent, and it's
 			 * going to be unallocated, so we will skip over it.
 			 */
 			return (SET_ERROR(ESRCH));
 		}
 		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
 		if (error) {
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
 		data = db->db.db_data;
 	}
 
 
 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
 	    db->db_blkptr->blk_birth <= txg ||
 	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
 		 * and these conditions mean that we need to keep climbing.
 		 */
 		error = SET_ERROR(ESRCH);
 	} else if (lvl == 0) {
 		/* level 0: 'data' is treated as an array of dnode_phys_t */
 		dnode_phys_t *dnp = data;
 		span = DNODE_SHIFT;
 		ASSERT(dn->dn_type == DMU_OT_DNODE);
 
 		/* step dnode by dnode, moving *offset by one dnode per miss */
 		for (i = (*offset >> span) & (blkfill - 1);
 		    i >= 0 && i < blkfill; i += inc) {
 			/* DMU_OT_NONE marks a free slot, i.e. a hole */
 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
 				break;
 			*offset += (1ULL << span) * inc;
 		}
 		if (i < 0 || i == blkfill)
 			error = SET_ERROR(ESRCH);
 	} else {
 		/* level > 0: 'data' is treated as an array of block pointers */
 		blkptr_t *bp = data;
 		uint64_t start = *offset;
 		/* span: log2 of the bytes covered by one pointer at this level */
 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
 		minfill = 0;
 		maxfill = blkfill << ((lvl - 1) * epbs);
 
 		/*
 		 * For a hole, accept pointers with fill below the maximum;
 		 * for data, accept pointers with a non-zero fill.
 		 */
 		if (hole)
 			maxfill--;
 		else
 			minfill++;
 
 		/* work in units of 'span' while scanning, then convert back */
 		*offset = *offset >> span;
 		for (i = BF64_GET(*offset, 0, epbs);
 		    i >= 0 && i < epb; i += inc) {
 			if (BP_GET_FILL(&bp[i]) >= minfill &&
 			    BP_GET_FILL(&bp[i]) <= maxfill &&
 			    (hole || bp[i].blk_birth > txg))
 				break;
 			/* never step *offset below zero when going backwards */
 			if (inc > 0 || *offset > 0)
 				*offset += inc;
 		}
 		*offset = *offset << span;
 		if (inc < 0) {
 			/* traversing backwards; position offset at the end */
 			ASSERT3U(*offset, <=, start);
 			*offset = MIN(*offset + (1ULL << span) - 1, start);
 		} else if (*offset < start) {
 			/* the shifts rounded down; never report before 'start' */
 			*offset = start;
 		}
 		if (i < 0 || i >= epb)
 			error = SET_ERROR(ESRCH);
 	}
 
 	if (db)
 		dbuf_rele(db, FTAG);
 
 	return (error);
 }
 
 /*
  * Find the next hole, data, or sparse region at or after *offset.
  * The value 'blkfill' tells us how many items we expect to find
  * in an L0 data block; this value is 1 for normal objects,
  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
  * DNODES_PER_BLOCK when searching for sparse regions thereof.
  *
  * Unless DNODE_FIND_HAVELOCK is set in 'flags', dn_struct_rwlock is
  * taken as reader for the duration of the search.
  *
  * Examples:
  *
  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
  *	Finds the next/previous hole/data in a file.
  *	Used in dmu_offset_next().
  *
  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
  *	Finds the next free/allocated dnode in an objset's meta-dnode.
  *	Only finds objects that have new contents since txg (ie.
  *	bonus buffer changes and content removal are ignored).
  *	Used in dmu_object_next().
  *
  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
  *	Used in dmu_object_alloc().
  */
 int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
 	uint64_t start_offset = *offset;
 	int take_lock = !(flags & DNODE_FIND_HAVELOCK);
 	int backwards = ((flags & DNODE_FIND_BACKWARDS) != 0);
 	int want_hole = ((flags & DNODE_FIND_HOLE) != 0);
 	int lvl, toplvl;
 	int error = 0;
 
 	if (take_lock)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* an object with no levels has nothing to find */
 	if (dn->dn_phys->dn_nlevels == 0) {
 		error = SET_ERROR(ESRCH);
 		goto out;
 	}
 
 	/* dn_datablkshift == 0: answer from dn_datablksz alone */
 	if (dn->dn_datablkshift == 0) {
 		if (*offset >= dn->dn_datablksz)
 			error = SET_ERROR(ESRCH);
 		else if (want_hole)
 			*offset = dn->dn_datablksz;
 		goto out;
 	}
 
 	toplvl = dn->dn_phys->dn_nlevels;
 
 	/* widen: climb the tree until a level stops answering ESRCH */
 	lvl = minlvl;
 	while (lvl <= toplvl) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 		if (error != ESRCH)
 			break;
 		lvl++;
 	}
 
 	/* narrow: walk back down to the level we were asked about */
 	while (error == 0) {
 		if (--lvl < minlvl)
 			break;
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 	}
 
 	/*
 	 * There's always a "virtual hole" at the end of the object, even
 	 * if all BP's which physically exist are non-holes.
 	 */
 	if (error == ESRCH && want_hole && !backwards &&
 	    txg == 0 && minlvl == 1 && blkfill == 1) {
 		error = 0;
 	}
 
 	/* a result on the wrong side of the starting offset is no match */
 	if (error == 0) {
 		int wrong_side;
 
 		if (backwards)
 			wrong_side = (start_offset < *offset);
 		else
 			wrong_side = (start_offset > *offset);
 		if (wrong_side)
 			error = SET_ERROR(ESRCH);
 	}
 out:
 	if (take_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	return (error);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c	(revision 275782)
@@ -1,456 +1,457 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/spa.h>
 #include <sys/dsl_bookmark.h>
 #include <zfs_namecheck.h>
 
 /*
  * Split a full bookmark name of the form "<dataset>#<bookmark>" and hold
  * the dataset part.
  *
  * On success *dsp is the held dataset (hold owned by 'tag') and
  * *shortnamep points into 'fullname', just past the '#'.  Returns
  * ENAMETOOLONG if the name does not fit in MAXNAMELEN, EINVAL if there is
  * no '#' or the part after it fails zfs_component_namecheck(), otherwise
  * the result of dsl_dataset_hold().
  */
 static int
 dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
     dsl_dataset_t **dsp, void *tag, char **shortnamep)
 {
 	char dsname[MAXNAMELEN];
 	char *sep;
 
 	if (strlen(fullname) >= MAXNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	sep = strchr(fullname, '#');
 	if (sep == NULL)
 		return (SET_ERROR(EINVAL));
 
 	*shortnamep = sep + 1;
 	if (zfs_component_namecheck(*shortnamep, NULL, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* copy everything before the '#'; the size includes the terminator */
 	(void) strlcpy(dsname, fullname, (sep - fullname) + 1);
 
 	return (dsl_dataset_hold(dp, dsname, tag, dsp));
 }
 
 /*
  * Look up bookmark 'shortname' in dataset 'ds' and copy its record into
  * *bmark_phys.
  *
  * Returns ESRCH if bookmark is not found, which covers both a dataset
  * with no bookmark ZAP object and an ENOENT result from the ZAP lookup.
  * Any other lookup error is returned unchanged.
  */
 static int
 dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
     zfs_bookmark_phys_t *bmark_phys)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t bmark_zapobj = ds->ds_bookmarks;
 	matchtype_t mt;
 	int err;
 
 	/* no bookmark ZAP object: this dataset has no bookmarks */
 	if (bmark_zapobj == 0)
 		return (SET_ERROR(ESRCH));
 
 	/* datasets flagged DS_FLAG_CI_DATASET use MT_FIRST, others MT_EXACT */
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
 
 	/* the record is read as an array of uint64_t-sized integers */
 	err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
 	    sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt,
 	    NULL, 0, NULL);
 
 	/* callers tell "no such bookmark" apart by ESRCH, not ENOENT */
 	return (err == ENOENT ? ESRCH : err);
 }
 
 /*
  * Look up the bookmark named by 'fullname' and copy its record into *bmp.
  *
  * If later_ds is non-NULL, this will return EXDEV if the specified
  * bookmark does not represent an earlier point in later_ds's timeline.
  *
  * Returns ENOENT if the dataset containing the bookmark does not exist.
  * Returns ESRCH if the dataset exists but the bookmark was not found in it.
  */
 int
 dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
     dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
 {
 	dsl_dataset_t *bmark_ds;
 	char *bmark_name;
 	int err;
 
 	err = dsl_bookmark_hold_ds(dp, fullname, &bmark_ds, FTAG,
 	    &bmark_name);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_bmark_lookup(bmark_ds, bmark_name, bmp);
 	if (err == 0 && later_ds != NULL &&
 	    !dsl_dataset_is_before(later_ds, bmark_ds,
 	    bmp->zbm_creation_txg)) {
 		err = SET_ERROR(EXDEV);
 	}
 
 	dsl_dataset_rele(bmark_ds, FTAG);
 	return (err);
 }
 
 /*
  * Argument bundle handed to dsl_bookmark_create_check() and
  * dsl_bookmark_create_sync().
  */
 typedef struct dsl_bookmark_create_arg {
 	/*
 	 * One pair per bookmark to create: the pair name is the full
 	 * bookmark name and its string value names the dataset to bookmark.
 	 */
 	nvlist_t *dbca_bmarks;
 	/* int32 error codes, keyed by bookmark name, added by the check */
 	nvlist_t *dbca_errors;
 } dsl_bookmark_create_arg_t;
 
 /*
  * Check whether bookmark 'bookmark_name' may be created from 'snapds'.
  *
  * Returns EINVAL if snapds is not a snapshot or if
  * dsl_dataset_is_before() rejects it for the bookmark's dataset, EEXIST
  * if a bookmark of that name is already present, and 0 if the name is
  * free.  Other errors from holding the dataset or from the lookup are
  * passed through.
  */
 static int
 dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	zfs_bookmark_phys_t existing;
 	dsl_dataset_t *fs;
 	char *shortname;
 	int err;
 
 	if (!dsl_dataset_is_snapshot(snapds))
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_bookmark_hold_ds(dp, bookmark_name, &fs, FTAG, &shortname);
 	if (err != 0)
 		return (err);
 
 	if (dsl_dataset_is_before(fs, snapds, 0)) {
 		err = dsl_dataset_bmark_lookup(fs, shortname, &existing);
 		dsl_dataset_rele(fs, FTAG);
 
 		switch (err) {
 		case 0:
 			/* the lookup succeeded, so the name is taken */
 			return (SET_ERROR(EEXIST));
 		case ESRCH:
 			/* not found: the bookmark can be created */
 			return (0);
 		default:
 			return (err);
 		}
 	}
 
 	dsl_dataset_rele(fs, FTAG);
 	return (SET_ERROR(EINVAL));
 }
 
 /*
  * Check callback for bookmark creation.  Every requested bookmark is
  * validated; each failure is recorded in dbca_errors under the bookmark's
  * name and the last failure seen becomes the return value.  Returns
  * ENOTSUP without examining the list if the bookmarks feature is not
  * enabled.
  */
 static int
 dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_arg_t *dbca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 	int rv = 0;
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
 		return (SET_ERROR(ENOTSUP));
 
 	pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
 	while (pair != NULL) {
 		dsl_dataset_t *snapds;
 		int error;
 
 		/* note: validity of nvlist checked by ioctl layer */
 		error = dsl_dataset_hold(dp, fnvpair_value_string(pair),
 		    FTAG, &snapds);
 		if (error == 0) {
 			error = dsl_bookmark_create_check_impl(snapds,
 			    nvpair_name(pair), tx);
 			dsl_dataset_rele(snapds, FTAG);
 		}
 
 		/* keep going after a failure so every bookmark is reported */
 		if (error != 0) {
 			fnvlist_add_int32(dbca->dbca_errors,
 			    nvpair_name(pair), error);
 			rv = error;
 		}
 
 		pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair);
 	}
 
 	return (rv);
 }
 
 /*
  * Sync callback for bookmark creation: adds one entry to the owning
  * dataset's bookmark ZAP object for every pair in dbca_bmarks.  Failures
  * are not expected here; every step is wrapped in VERIFY0().
  */
 static void
 dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_arg_t *dbca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 
 	ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
 		dsl_dataset_t *snapds, *bmark_fs;
 		zfs_bookmark_phys_t bmark_phys;
 		char *shortname;
 
 		/* pair value: dataset to bookmark; pair name: bookmark name */
 		VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair),
 		    FTAG, &snapds));
 		VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
 		    &bmark_fs, FTAG, &shortname));
 		/*
 		 * First bookmark on this dataset: create the bookmark ZAP
 		 * object, bump the feature count, and record the new
 		 * object number in the dataset's own ZAP under
 		 * DS_FIELD_BOOKMARK_NAMES.
 		 */
 		if (bmark_fs->ds_bookmarks == 0) {
 			bmark_fs->ds_bookmarks =
 			    zap_create_norm(mos, U8_TEXTPREP_TOUPPER,
 			    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 			spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 
 			dsl_dataset_zapify(bmark_fs, tx);
 			VERIFY0(zap_add(mos, bmark_fs->ds_object,
 			    DS_FIELD_BOOKMARK_NAMES,
 			    sizeof (bmark_fs->ds_bookmarks), 1,
 			    &bmark_fs->ds_bookmarks, tx));
 		}
 
 		/* the record copies guid, creation txg and creation time */
-		bmark_phys.zbm_guid = snapds->ds_phys->ds_guid;
-		bmark_phys.zbm_creation_txg = snapds->ds_phys->ds_creation_txg;
+		bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid;
+		bmark_phys.zbm_creation_txg =
+		    dsl_dataset_phys(snapds)->ds_creation_txg;
 		bmark_phys.zbm_creation_time =
-		    snapds->ds_phys->ds_creation_time;
+		    dsl_dataset_phys(snapds)->ds_creation_time;
 
 		/* stored as an array of uint64_t-sized integers */
 		VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks,
 		    shortname, sizeof (uint64_t),
 		    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 		    &bmark_phys, tx));
 
 		spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
 		    "name=%s creation_txg=%llu target_snap=%llu",
 		    shortname,
 		    (longlong_t)bmark_phys.zbm_creation_txg,
 		    (longlong_t)snapds->ds_object);
 
 		dsl_dataset_rele(bmark_fs, FTAG);
 		dsl_dataset_rele(snapds, FTAG);
 	}
 }
 
 /*
  * Create the bookmarks described by 'bmarks' in a single sync task;
  * per-bookmark failures are reported through 'errors'.  An empty list
  * succeeds without doing anything.
  *
  * The bookmarks must all be in the same pool.
  */
 int
 dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
 {
 	dsl_bookmark_create_arg_t dbca;
 	nvpair_t *first = nvlist_next_nvpair(bmarks, NULL);
 
 	if (first == NULL)
 		return (0);
 
 	dbca.dbca_bmarks = bmarks;
 	dbca.dbca_errors = errors;
 
 	/* the first bookmark's name is the name the sync task is given */
 	return (dsl_sync_task(nvpair_name(first), dsl_bookmark_create_check,
 	    dsl_bookmark_create_sync, &dbca,
 	    fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * Add one nvlist to 'outnvl' for every bookmark of 'ds', keyed by the
  * bookmark name.  Each of those lists holds the guid, createtxg and
  * creation properties, but only the ones that also appear in 'props'.
  *
  * Returns 0 on success (including a dataset with no bookmarks), or the
  * first bookmark lookup error, which ends the walk early.
  */
 int
 dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	zap_attribute_t attr;
 	zap_cursor_t zc;
 	int err = 0;
 
 	if (ds->ds_bookmarks == 0)
 		return (0);
 
 	zap_cursor_init(&zc, dp->dp_meta_objset, ds->ds_bookmarks);
 	while (zap_cursor_retrieve(&zc, &attr) == 0) {
 		zfs_bookmark_phys_t bmark_phys;
 		nvlist_t *out_props;
 
 		err = dsl_dataset_bmark_lookup(ds, attr.za_name, &bmark_phys);
 		ASSERT3U(err, !=, ENOENT);
 		if (err != 0)
 			break;
 
 		out_props = fnvlist_alloc();
 		if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_GUID))) {
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_GUID, bmark_phys.zbm_guid);
 		}
 		if (nvlist_exists(props,
 		    zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg);
 		}
 		if (nvlist_exists(props,
 		    zfs_prop_to_name(ZFS_PROP_CREATION))) {
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_CREATION, bmark_phys.zbm_creation_time);
 		}
 
 		/* our local list is freed right after it is added to outnvl */
 		fnvlist_add_nvlist(outnvl, attr.za_name, out_props);
 		fnvlist_free(out_props);
 
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 
 	return (err);
 }
 
 /*
  * Retrieve the bookmarks that exist in the specified dataset, and the
  * requested properties of each bookmark.
  *
  * The "props" nvlist specifies which properties are requested.
  * See lzc_get_bookmarks() for the list of valid properties.
  *
  * The pool and dataset are held only for the duration of the call.
  */
 int
 dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_pool_hold(dsname, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (err == 0) {
 		err = dsl_get_bookmarks_impl(ds, props, outnvl);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
 /*
  * Argument bundle handed to dsl_bookmark_destroy_check() and
  * dsl_bookmark_destroy_sync().
  */
 typedef struct dsl_bookmark_destroy_arg {
 	/* requested bookmarks; each pair name is a full bookmark name */
 	nvlist_t *dbda_bmarks;
 	/* bookmarks the check found to exist; the sync walks this list */
 	nvlist_t *dbda_success;
 	/* int32 error codes, keyed by bookmark name, added by the check */
 	nvlist_t *dbda_errors;
 } dsl_bookmark_destroy_arg_t;
 
 /*
  * Remove the entry 'name' from the bookmark ZAP object of 'ds' in 'tx'.
  * The match type is chosen the same way as for lookups: MT_FIRST for
  * datasets flagged DS_FLAG_CI_DATASET, MT_EXACT otherwise.  Returns the
  * result of zap_remove_norm().
  */
 static int
 dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t bmark_zapobj = ds->ds_bookmarks;
 	matchtype_t mt;
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
 
 	return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
 }
 
 /*
  * Check callback for bookmark destruction.  Bookmarks that exist are
  * added to dbda_success.  Bookmarks whose dataset or entry is missing
  * are treated as already destroyed and skipped.  Any other failure is
  * recorded in dbda_errors, and the last one seen is returned.  Returns 0
  * without examining the list if the bookmarks feature is not enabled.
  */
 static int
 dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_destroy_arg_t *dbda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int rv = 0;
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
 		return (0);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
 		const char *fullname = nvpair_name(pair);
 		zfs_bookmark_phys_t bm;
 		dsl_dataset_t *ds;
 		char *shortname;
 		int already_gone;
 		int error;
 
 		error = dsl_bookmark_hold_ds(dp, fullname, &ds,
 		    FTAG, &shortname);
 		/* ENOENT from the hold: the dataset does not exist */
 		already_gone = (error == ENOENT);
 		if (error == 0) {
 			error = dsl_dataset_bmark_lookup(ds, shortname, &bm);
 			dsl_dataset_rele(ds, FTAG);
 			/* ESRCH from the lookup: the bookmark is not there */
 			already_gone = (error == ESRCH);
 		}
 
 		/* ignore it; the bookmark is "already destroyed" */
 		if (already_gone)
 			continue;
 
 		if (error != 0) {
 			fnvlist_add_int32(dbda->dbda_errors, fullname, error);
 			rv = error;
 		} else {
 			fnvlist_add_boolean(dbda->dbda_success, fullname);
 		}
 	}
 	return (rv);
 }
 
 /*
  * Sync callback for bookmark destruction: removes every bookmark listed
  * in dbda_success (the list built by the check callback).  Failures are
  * not expected here; every step is wrapped in VERIFY0().
  */
 static void
 dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_destroy_arg_t *dbda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
 		dsl_dataset_t *ds;
 		char *shortname;
 		uint64_t zap_cnt;
 
 		VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
 		    &ds, FTAG, &shortname));
 		VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx));
 
 		/*
 		 * If all of this dataset's bookmarks have been destroyed,
 		 * free the zap object and decrement the feature's use count.
 		 */
 		VERIFY0(zap_count(mos, ds->ds_bookmarks,
 		    &zap_cnt));
 		if (zap_cnt == 0) {
 			dmu_buf_will_dirty(ds->ds_dbuf, tx);
 			VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
 			ds->ds_bookmarks = 0;
 			spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 			/* also drop the dataset ZAP's pointer to that object */
 			VERIFY0(zap_remove(mos, ds->ds_object,
 			    DS_FIELD_BOOKMARK_NAMES, tx));
 		}
 
 		spa_history_log_internal_ds(ds, "remove bookmark", tx,
 		    "name=%s", shortname);
 
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * Destroy the bookmarks named in 'bmarks' in a single sync task;
  * per-bookmark failures are reported through 'errors'.  An empty list
  * succeeds without doing anything.
  *
  * The bookmarks must all be in the same pool.
  */
 int
 dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
 {
 	dsl_bookmark_destroy_arg_t dbda;
 	nvpair_t *first = nvlist_next_nvpair(bmarks, NULL);
 	int rv;
 
 	if (first == NULL)
 		return (0);
 
 	dbda.dbda_bmarks = bmarks;
 	dbda.dbda_errors = errors;
 	/* scratch list: filled by the check, walked by the sync */
 	dbda.dbda_success = fnvlist_alloc();
 
 	rv = dsl_sync_task(nvpair_name(first), dsl_bookmark_destroy_check,
 	    dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
 	    ZFS_SPACE_CHECK_RESERVED);
 
 	fnvlist_free(dbda.dbda_success);
 	return (rv);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 275782)
@@ -1,3388 +1,3429 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 RackTop Systems.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 
 SYSCTL_DECL(_vfs_zfs);
 
 /*
  * The SPA supports block sizes up to 16MB.  However, very large blocks
  * can have an impact on i/o latency (e.g. tying up a spinning disk for
  * ~300ms), and also potentially on the memory allocator.  Therefore,
  * we do not allow the recordsize to be set larger than zfs_max_recordsize
  * (default 1MB).  Larger blocks can be created by changing this tunable,
  * and pools with larger blocks can always be imported and used, regardless
  * of this setting.
  */
 int zfs_max_recordsize = 1 * 1024 * 1024;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
     &zfs_max_recordsize, 0,
     "Maximum block size.  Expect dragons when tuning this.");
 
 #define	SWITCH64(x, y) \
 	{ \
 		uint64_t __tmp = (x); \
 		(x) = (y); \
 		(y) = __tmp; \
 	}
 
 #define	DS_REF_MAX	(1ULL << 62)
 
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds);
+
 /*
  * Figure out how much of this delta should be propogated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
  * partially accounted for in our ancestors.
  */
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
+	dsl_dataset_phys_t *ds_phys;
 	uint64_t old_bytes, new_bytes;
 
 	if (ds->ds_reserved == 0)
 		return (delta);
 
-	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+	ds_phys = dsl_dataset_phys(ds);
+	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
 
 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
 	return (new_bytes - old_bytes);
 }
 
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
 
 	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    used, compressed, uncompressed);
 		return;
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
-	ds->ds_phys->ds_referenced_bytes += used;
-	ds->ds_phys->ds_compressed_bytes += compressed;
-	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
-	ds->ds_phys->ds_unique_bytes += used;
+	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+	dsl_dataset_phys(ds)->ds_unique_bytes += used;
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
 		ds->ds_need_large_blocks = B_TRUE;
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
 	dsl_dir_transfer_space(ds->ds_dir, used - delta,
 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 }
 
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(bp->blk_birth <= tx->tx_txg);
 
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
-	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		mutex_enter(&ds->ds_lock);
-		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		delta = parent_delta(ds, -used);
-		ds->ds_phys->ds_unique_bytes -= used;
+		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    delta, -compressed, -uncompressed, tx);
 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
 			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
 			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
-		    ds->ds_phys->ds_prev_snap_obj);
-		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object && bp->blk_birth >
-		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
-			ds->ds_prev->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
 		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
 	mutex_enter(&ds->ds_lock);
-	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
-	ds->ds_phys->ds_referenced_bytes -= used;
-	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
-	ds->ds_phys->ds_compressed_bytes -= compressed;
-	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
-	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
 }
 
 uint64_t
 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 {
 	uint64_t trysnap = 0;
 
 	if (ds == NULL)
 		return (0);
 	/*
 	 * The snapshot creation could fail, but that would cause an
 	 * incorrect FALSE return, which would only result in an
 	 * overestimation of the amount of space that an operation would
 	 * consume, which is OK.
 	 *
 	 * There's also a small window where we could miss a pending
 	 * snapshot, because we could set the sync task in the quiescing
 	 * phase.  So this should only be used as a guess.
 	 */
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		trysnap = ds->ds_trysnap_txg;
-	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
 }
 
 boolean_t
 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth)
 {
 	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
 	    (bp != NULL && BP_IS_HOLE(bp)))
 		return (B_FALSE);
 
 	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 
 	return (B_TRUE);
 }
 
 /* ARGSUSED */
 static void
 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 {
 	dsl_dataset_t *ds = dsv;
 
 	ASSERT(ds->ds_owner == NULL);
 
 	unique_remove(ds->ds_fsid_guid);
 
 	if (ds->ds_objset != NULL)
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
-	if (ds->ds_phys->ds_deadlist_obj != 0)
+	if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0)
 		dsl_deadlist_close(&ds->ds_deadlist);
 	if (ds->ds_dir)
 		dsl_dir_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
 	if (mutex_owned(&ds->ds_lock))
 		mutex_exit(&ds->ds_lock);
 	mutex_destroy(&ds->ds_lock);
 	if (mutex_owned(&ds->ds_opening_lock))
 		mutex_exit(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_sendstream_lock);
 	refcount_destroy(&ds->ds_longholds);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
 	int err;
 	dmu_buf_t *headdbuf;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (ds->ds_snapname[0])
 		return (0);
-	if (ds->ds_phys->ds_next_snap_obj == 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
 		return (0);
 
-	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
 	if (err != 0)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
 	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 	dmu_buf_rele(headdbuf, FTAG);
 	return (err);
 }
 
 int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
 
 	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 	    value, mt, NULL, 0, NULL);
 	if (err == ENOTSUP && mt == MT_FIRST)
 		err = zap_lookup(mos, snapobj, name, 8, 1, value);
 	return (err);
 }
 
 int
 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
     boolean_t adj_cnt)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
 
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (err == ENOTSUP && mt == MT_FIRST)
 		err = zap_remove(mos, snapobj, name, tx);
 
 	if (err == 0 && adj_cnt)
 		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	return (err);
 }
 
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 	dmu_object_info_t doi;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 
 	/* Make sure dsobj has the correct object type. */
 	dmu_object_info_from_db(dbuf, &doi);
 	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
 		dmu_buf_rele(dbuf, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
 		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
-		ds->ds_phys = dbuf->db_data;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 		refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
 		dsl_deadlist_open(&ds->ds_deadlist,
-		    mos, ds->ds_phys->ds_deadlist_obj);
+		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 		    offsetof(dmu_sendarg_t, dsa_link));
 
 		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
 			int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
 			if (zaperr != ENOENT) {
 				VERIFY0(zaperr);
 				ds->ds_large_blocks = B_TRUE;
 			}
 		}
 
 		if (err == 0) {
 			err = dsl_dir_hold_obj(dp,
-			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+			    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
+			    &ds->ds_dir);
 		}
 		if (err != 0) {
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		if (!dsl_dataset_is_snapshot(ds)) {
 			ds->ds_snapname[0] = '\0';
-			if (ds->ds_phys->ds_prev_snap_obj != 0) {
+			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 				err = dsl_dataset_hold_obj(dp,
-				    ds->ds_phys->ds_prev_snap_obj,
+				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
 			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
 				int zaperr = zap_lookup(mos, ds->ds_object,
 				    DS_FIELD_BOOKMARK_NAMES,
 				    sizeof (ds->ds_bookmarks), 1,
 				    &ds->ds_bookmarks);
 				if (zaperr != ENOENT)
 					VERIFY0(zaperr);
 			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
-			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+			if (err == 0 &&
+			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
-				    ds->ds_phys->ds_userrefs_obj,
+				    dsl_dataset_phys(ds)->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
 		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 			err = dsl_prop_get_int_ds(ds,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 			    &ds->ds_reserved);
 			if (err == 0) {
 				err = dsl_prop_get_int_ds(ds,
 				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 				    &ds->ds_quota);
 			}
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
 		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
-		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
+		    dsl_dataset_evict)) != NULL) {
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
 				dsl_dataset_rele(ds->ds_prev, ds);
 			dsl_dir_rele(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err != 0) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
-			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
-	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	*dsp = ds;
 	return (0);
 }
 
 int
 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dd;
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
 	if (err != 0)
 		return (err);
 
 	ASSERT(dsl_pool_config_held(dp));
-	obj = dd->dd_phys->dd_head_dataset_obj;
+	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (obj != 0)
 		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
 	else
 		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
 		dsl_dataset_t *ds;
 
 		if (*snapname++ != '@') {
 			dsl_dataset_rele(*dsp, tag);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
 		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
 		if (err == 0)
 			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 		dsl_dataset_rele(*dsp, tag);
 
 		if (err == 0) {
 			mutex_enter(&ds->ds_lock);
 			if (ds->ds_snapname[0] == 0)
 				(void) strlcpy(ds->ds_snapname, snapname,
 				    sizeof (ds->ds_snapname));
 			mutex_exit(&ds->ds_lock);
 			*dsp = ds;
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
 
 int
 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 	if (err != 0)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		*dsp = NULL;
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 int
 dsl_dataset_own(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold(dp, name, tag, dsp);
 	if (err != 0)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 /*
  * See the comment above dsl_pool_hold() for details.  In summary, a long
  * hold is used to prevent destruction of a dataset while the pool hold
  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
  *
  * The dataset and pool must be held when this function is called.  After it
  * is called, the pool hold may be released while the dataset is still held
  * and accessed.
  */
 void
 dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
 {
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	(void) refcount_add(&ds->ds_longholds, tag);
 }
 
 void
 dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
 {
 	(void) refcount_remove(&ds->ds_longholds, tag);
 }
 
 /* Return B_TRUE if there are any long holds on this dataset. */
 boolean_t
 dsl_dataset_long_held(dsl_dataset_t *ds)
 {
 	return (!refcount_is_zero(&ds->ds_longholds));
 }
 
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	if (ds == NULL) {
 		(void) strcpy(name, "mos");
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			(void) strcat(name, "@");
 			/*
 			 * We use a "recursive" mutex so that we
 			 * can call dprintf_ds() with ds_lock held.
 			 */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
 				(void) strcat(name, ds->ds_snapname);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				(void) strcat(name, ds->ds_snapname);
 			}
 		}
 	}
 }
 
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
 	ASSERT3P(ds->ds_owner, ==, tag);
 	ASSERT(ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
 	mutex_exit(&ds->ds_lock);
 	dsl_dataset_long_rele(ds, tag);
 	dsl_dataset_rele(ds, tag);
 }
 
 boolean_t
 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
 {
 	boolean_t gotit = FALSE;
 
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
 		ds->ds_owner = tag;
 		dsl_dataset_long_hold(ds, tag);
 		gotit = TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (origin == NULL)
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
-	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_flags = flags;
 	dsphys->ds_fsid_guid = unique_create();
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
 	dsphys->ds_snapnames_zapobj =
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 
 	if (origin == NULL) {
 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 	} else {
 		dsl_dataset_t *ohds; /* head of the origin snapshot */
 
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
-		    origin->ds_phys->ds_creation_txg;
+		    dsl_dataset_phys(origin)->ds_creation_txg;
 		dsphys->ds_referenced_bytes =
-		    origin->ds_phys->ds_referenced_bytes;
+		    dsl_dataset_phys(origin)->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
-		    origin->ds_phys->ds_compressed_bytes;
+		    dsl_dataset_phys(origin)->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
-		    origin->ds_phys->ds_uncompressed_bytes;
-		dsphys->ds_bp = origin->ds_phys->ds_bp;
+		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
 
 		/*
 		 * Inherit flags that describe the dataset's contents
 		 * (INCONSISTENT) or properties (Case Insensitive).
 		 */
-		dsphys->ds_flags |= origin->ds_phys->ds_flags &
+		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
 		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
 		if (origin->ds_large_blocks)
 			dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		origin->ds_phys->ds_num_children++;
+		dsl_dataset_phys(origin)->ds_num_children++;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+		    FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 		dsl_dataset_rele(ohds, FTAG);
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
-			if (origin->ds_phys->ds_next_clones_obj == 0) {
-				origin->ds_phys->ds_next_clones_obj =
+			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+				dsl_dataset_phys(origin)->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
-			    origin->ds_phys->ds_next_clones_obj, dsobj, tx));
+			    dsl_dataset_phys(origin)->ds_next_clones_obj,
+			    dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		dd->dd_phys->dd_origin_obj = origin->ds_object;
+		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-			if (origin->ds_dir->dd_phys->dd_clones == 0) {
+			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-				origin->ds_dir->dd_phys->dd_clones =
+				dsl_dir_phys(origin->ds_dir)->dd_clones =
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
-			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+			    dsl_dir_phys(origin->ds_dir)->dd_clones,
+			    dsobj, tx));
 		}
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
 
 static void
 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *os;
 
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 	dsl_dataset_dirty(ds, tx);
 }
 
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	uint64_t dsobj, ddobj;
 	dsl_dir_t *dd;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
 
 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
 
 	dsobj = dsl_dataset_create_sync_dd(dd, origin,
 	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
 	/*
 	 * Since we're creating a new node we know it's a leaf, so we can
 	 * initialize the counts if the limit feature is active.
 	 */
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		uint64_t cnt = 0;
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 
 		dsl_dir_zapify(dd, tx);
 		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
 		    sizeof (cnt), 1, &cnt, tx));
 		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
 		    sizeof (cnt), 1, &cnt, tx));
 	}
 
 	dsl_dir_rele(dd, FTAG);
 
 	/*
 	 * If we are creating a clone, make sure we zero out any stale
 	 * data from the origin snapshots zil header.
 	 */
 	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
 		dsl_dataset_t *ds;
 
 		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		dsl_dataset_zero_zil(ds, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	return (dsobj);
 }
 
 #ifdef __FreeBSD__
 /* FreeBSD ioctl compat begin */
 struct destroyarg {
 	nvlist_t *nvl;
 	const char *snapname;
 };
 
 static int
 dsl_check_snap_cb(const char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	dsl_dataset_t *ds;
 	char *dsname;
 
 	dsname = kmem_asprintf("%s@%s", name, da->snapname);
 	fnvlist_add_boolean(da->nvl, dsname);
 	kmem_free(dsname, strlen(dsname) + 1);
 
 	return (0);
 }
 
 int
 dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
     nvlist_t *snaps)
 {
 	struct destroyarg *da;
 	int err;
 
 	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
 	da->nvl = snaps;
 	da->snapname = snapname;
 	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
 	    DS_FIND_CHILDREN);
 	kmem_free(da, sizeof (struct destroyarg));
 
 	return (err);
 }
 /* FreeBSD ioctl compat end */
 #endif /* __FreeBSD__ */
 
 /*
  * The unique space in the head dataset can be calculated by subtracting
  * the space used in the most recent snapshot, that is still being used
  * in this file system, from the space currently in use.  To figure out
  * the space in the most recent snapshot still in use, we need to take
  * the total space used in the snapshot and subtract out the space that
  * has been freed up since the snapshot was taken.
  */
 void
 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
 	uint64_t mrs_used;
 	uint64_t dlused, dlcomp, dluncomp;
 
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0)
-		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
 	else
 		mrs_used = 0;
 
 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
 	ASSERT3U(dlused, <=, mrs_used);
-	ds->ds_phys->ds_unique_bytes =
-	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
+	dsl_dataset_phys(ds)->ds_unique_bytes =
+	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
 void
 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t count;
 	int err;
 
-	ASSERT(ds->ds_phys->ds_num_children >= 2);
-	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    obj, tx);
 	/*
 	 * The err should not be ENOENT, but a bug in a previous version
 	 * of the code could cause upgrade_clones_cb() to not set
 	 * ds_next_snap_obj when it should, leading to a missing entry.
 	 * If we knew that the pool was created after
 	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
 	 * ENOENT.  However, at least we can check that we don't have
 	 * too many entries in the next_clones_obj even after failing to
 	 * remove this one.
 	 */
 	if (err != ENOENT)
 		VERIFY0(err);
-	ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    &count));
-	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
 
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
-	return (&ds->ds_phys->ds_bp);
+	return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
 void
 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* If it's the meta-objset, set dp_meta_rootbp */
 	if (ds == NULL) {
 		tx->tx_pool->dp_meta_rootbp = *bp;
 	} else {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_bp = *bp;
+		dsl_dataset_phys(ds)->ds_bp = *bp;
 	}
 }
 
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	return (ds->ds_dir->dd_pool->dp_spa);
 }
 
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp;
 
 	if (ds == NULL) /* this is the meta-objset */
 		return;
 
 	ASSERT(ds->ds_objset != NULL);
 
-	if (ds->ds_phys->ds_next_snap_obj != 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	dp = ds->ds_dir->dd_pool;
 
 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 	}
 }
 
 boolean_t
 dsl_dataset_is_dirty(dsl_dataset_t *ds)
 {
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
 		    ds, t))
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 static int
 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t asize;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * If there's an fs-only reservation, any blocks that might become
 	 * owned by the snapshot dataset must be accommodated by space
 	 * outside of the reservation.
 	 */
 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
-	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/*
 	 * Propagate any reserved space for this snapshot to other
 	 * snapshot checks in this sync group.
 	 */
 	if (asize > 0)
 		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
 
 	return (0);
 }
 
 typedef struct dsl_dataset_snapshot_arg {
 	nvlist_t *ddsa_snaps;
 	nvlist_t *ddsa_props;
 	nvlist_t *ddsa_errors;
 	cred_t *ddsa_cr;
 } dsl_dataset_snapshot_arg_t;
 
 int
 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
 {
 	int error;
 	uint64_t value;
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
-	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
 		return (SET_ERROR(EAGAIN));
 
 	/*
 	 * Check for conflicting snapshot name.
 	 */
 	error = dsl_dataset_snap_lookup(ds, snapname, &value);
 	if (error == 0)
 		return (SET_ERROR(EEXIST));
 	if (error != ENOENT)
 		return (error);
 
 	/*
 	 * We don't allow taking snapshots of inconsistent datasets, such as
 	 * those into which we are currently receiving.  However, if we are
 	 * creating this snapshot as part of a receive, this check will be
 	 * executed atomically with respect to the completion of the receive
 	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
 	 * case we ignore this, knowing it will be fixed up for us shortly in
 	 * dmu_recv_end_sync().
 	 */
 	if (!recv && DS_IS_INCONSISTENT(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Skip the check for temporary snapshots or if we have already checked
 	 * the counts in dsl_dataset_snapshot_check. This means we really only
 	 * check the count here when we're receiving a stream.
 	 */
 	if (cnt != 0 && cr != NULL) {
 		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_dataset_snapshot_reserve_space(ds, tx);
 	if (error != 0)
 		return (error);
 
 	return (0);
 }
 
 static int
 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 	int rv = 0;
 
 	/*
 	 * Pre-compute how many total new snapshots will be created for each
 	 * level in the tree and below. This is needed for validating the
 	 * snapshot limit when either taking a recursive snapshot or when
 	 * taking multiple snapshots.
 	 *
 	 * The problem is that the counts are not actually adjusted when
 	 * we are checking, only when we finally sync. For a single snapshot,
 	 * this is easy, the count will increase by 1 at each node up the tree,
 	 * but its more complicated for the recursive/multiple snapshot case.
 	 *
 	 * The dsl_fs_ss_limit_check function does recursively check the count
 	 * at each level up the tree but since it is validating each snapshot
 	 * independently we need to be sure that we are validating the complete
 	 * count for the entire set of snapshots. We do this by rolling up the
 	 * counts for each component of the name into an nvlist and then
 	 * checking each of those cases with the aggregated count.
 	 *
 	 * This approach properly handles not only the recursive snapshot
 	 * case (where we get all of those on the ddsa_snaps list) but also
 	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
 	 * validate the limit on 'a' using a count of 2).
 	 *
 	 * We validate the snapshot names in the third loop and only report
 	 * name errors once.
 	 */
 	if (dmu_tx_is_syncing(tx)) {
 		nvlist_t *cnt_track = NULL;
 		cnt_track = fnvlist_alloc();
 
 		/* Rollup aggregated counts into the cnt_track list */
 		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 		    pair != NULL;
 		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 			char *pdelim;
 			uint64_t val;
 			char nm[MAXPATHLEN];
 
 			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
 			pdelim = strchr(nm, '@');
 			if (pdelim == NULL)
 				continue;
 			*pdelim = '\0';
 
 			do {
 				if (nvlist_lookup_uint64(cnt_track, nm,
 				    &val) == 0) {
 					/* update existing entry */
 					fnvlist_add_uint64(cnt_track, nm,
 					    val + 1);
 				} else {
 					/* add to list */
 					fnvlist_add_uint64(cnt_track, nm, 1);
 				}
 
 				pdelim = strrchr(nm, '/');
 				if (pdelim != NULL)
 					*pdelim = '\0';
 			} while (pdelim != NULL);
 		}
 
 		/* Check aggregated counts at each level */
 		for (pair = nvlist_next_nvpair(cnt_track, NULL);
 		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
 			int error = 0;
 			char *name;
 			uint64_t cnt = 0;
 			dsl_dataset_t *ds;
 
 			name = nvpair_name(pair);
 			cnt = fnvpair_value_uint64(pair);
 			ASSERT(cnt > 0);
 
 			error = dsl_dataset_hold(dp, name, FTAG, &ds);
 			if (error == 0) {
 				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 				    ddsa->ddsa_cr);
 				dsl_dataset_rele(ds, FTAG);
 			}
 
 			if (error != 0) {
 				if (ddsa->ddsa_errors != NULL)
 					fnvlist_add_int32(ddsa->ddsa_errors,
 					    name, error);
 				rv = error;
 				/* only report one error for this check */
 				break;
 			}
 		}
 		nvlist_free(cnt_track);
 	}
 
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		int error = 0;
 		dsl_dataset_t *ds;
 		char *name, *atp;
 		char dsname[MAXNAMELEN];
 
 		name = nvpair_name(pair);
 		if (strlen(name) >= MAXNAMELEN)
 			error = SET_ERROR(ENAMETOOLONG);
 		if (error == 0) {
 			atp = strchr(name, '@');
 			if (atp == NULL)
 				error = SET_ERROR(EINVAL);
 			if (error == 0)
 				(void) strlcpy(dsname, name, atp - name + 1);
 		}
 		if (error == 0)
 			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 		if (error == 0) {
 			/* passing 0/NULL skips dsl_fs_ss_limit_check */
 			error = dsl_dataset_snapshot_check_impl(ds,
 			    atp + 1, tx, B_FALSE, 0, NULL);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (error != 0) {
 			if (ddsa->ddsa_errors != NULL) {
 				fnvlist_add_int32(ddsa->ddsa_errors,
 				    name, error);
 			}
 			rv = error;
 		}
 	}
 
 	return (rv);
 }
 
 void
 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
 	static zil_header_t zero_zil;
 
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
 	objset_t *os;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/*
 	 * If we are on an old pool, the zil must not be active, in which
 	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
 	 */
 	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
 	    dmu_objset_from_ds(ds, &os) != 0 ||
 	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
 	    sizeof (zero_zil)) == 0);
 
 	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
 	 */
 	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
 		crtxg = 1;
 	else
 		crtxg = tx->tx_txg;
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
-	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
-	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
-	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
-	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
-	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
-	dsphys->ds_flags = ds->ds_phys->ds_flags;
-	dsphys->ds_bp = ds->ds_phys->ds_bp;
+	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+	dsphys->ds_uncompressed_bytes =
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
 	dmu_buf_rele(dbuf, FTAG);
 
 	if (ds->ds_large_blocks)
 		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
 
-	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+	ASSERT3U(ds->ds_prev != 0, ==,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
-		    ds->ds_prev->ds_phys->ds_next_clones_obj;
-		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object ||
-		    ds->ds_prev->ds_phys->ds_num_children > 1);
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-			    ds->ds_prev->ds_phys->ds_creation_txg);
-			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
 			VERIFY0(zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
 
 	/*
 	 * If we have a reference-reservation on this dataset, we will
 	 * need to increase the amount of refreservation being charged
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
 		int64_t delta;
 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+		    ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 		    delta, 0, 0, tx);
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
-	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
-	    ds->ds_phys->ds_prev_snap_txg, tx);
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 
-	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
-	ds->ds_phys->ds_prev_snap_obj = dsobj;
-	ds->ds_phys->ds_prev_snap_txg = crtxg;
-	ds->ds_phys->ds_unique_bytes = 0;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
-	VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx));
 
 	if (ds->ds_prev)
 		dsl_dataset_rele(ds->ds_prev, ds);
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
 	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
 }
 
 static void
 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		dsl_dataset_t *ds;
 		char *name, *atp;
 		char dsname[MAXNAMELEN];
 
 		name = nvpair_name(pair);
 		atp = strchr(name, '@');
 		(void) strlcpy(dsname, name, atp - name + 1);
 		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
 
 		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
 		if (ddsa->ddsa_props != NULL) {
 			dsl_props_set_sync_impl(ds->ds_prev,
 			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
 		}
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * The snapshots must all be in the same pool.
  * All-or-nothing: if there are any failures, nothing will be modified.
  */
 int
 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 {
 	dsl_dataset_snapshot_arg_t ddsa;
 	nvpair_t *pair;
 	boolean_t needsuspend;
 	int error;
 	spa_t *spa;
 	char *firstname;
 	nvlist_t *suspended = NULL;
 
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (0);
 	firstname = nvpair_name(pair);
 
 	error = spa_open(firstname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		suspended = fnvlist_alloc();
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
 			char fsname[MAXNAMELEN];
 			char *snapname = nvpair_name(pair);
 			char *atp;
 			void *cookie;
 
 			atp = strchr(snapname, '@');
 			if (atp == NULL) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			(void) strlcpy(fsname, snapname, atp - snapname + 1);
 
 			error = zil_suspend(fsname, &cookie);
 			if (error != 0)
 				break;
 			fnvlist_add_uint64(suspended, fsname,
 			    (uintptr_t)cookie);
 		}
 	}
 
 	ddsa.ddsa_snaps = snaps;
 	ddsa.ddsa_props = props;
 	ddsa.ddsa_errors = errors;
 	ddsa.ddsa_cr = CRED();
 
 	if (error == 0) {
 		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
 		    dsl_dataset_snapshot_sync, &ddsa,
 		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
 	}
 
 	if (suspended != NULL) {
 		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(suspended, pair)) {
 			zil_resume((void *)(uintptr_t)
 			    fnvpair_value_uint64(pair));
 		}
 		fnvlist_free(suspended);
 	}
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	if (error == 0) {
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
 			char *snapname = nvpair_name(pair);
 			zvol_create_minors(snapname);
 		}
 	}
 #endif
 #endif
 	return (error);
 }
 
 typedef struct dsl_dataset_snapshot_tmp_arg {
 	const char *ddsta_fsname;
 	const char *ddsta_snapname;
 	minor_t ddsta_cleanup_minor;
 	const char *ddsta_htag;
 } dsl_dataset_snapshot_tmp_arg_t;
 
 static int
 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* NULL cred means no limit check for tmp snapshot */
 	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
 	    tx, B_FALSE, 0, NULL);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
 	    B_TRUE, tx);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
 
 	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
 	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
 	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
 	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
     minor_t cleanup_minor, const char *htag)
 {
 	dsl_dataset_snapshot_tmp_arg_t ddsta;
 	int error;
 	spa_t *spa;
 	boolean_t needsuspend;
 	void *cookie;
 
 	ddsta.ddsta_fsname = fsname;
 	ddsta.ddsta_snapname = snapname;
 	ddsta.ddsta_cleanup_minor = cleanup_minor;
 	ddsta.ddsta_htag = htag;
 
 	error = spa_open(fsname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		error = zil_suspend(fsname, &cookie);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
 	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
 
 	if (needsuspend)
 		zil_resume(cookie);
 	return (error);
 }
 
 
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
-	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
 
 	/*
 	 * in case we had to change ds_fsid_guid when we opened it,
 	 * sync it out now.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
 	dmu_objset_sync(ds->ds_objset, zio, tx);
 
 	if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
 		dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
 		ds->ds_large_blocks = B_TRUE;
 	}
 }
 
 static void
 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	uint64_t count = 0;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	nvlist_t *propval = fnvlist_alloc();
 	nvlist_t *val = fnvlist_alloc();
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/*
 	 * There may be missing entries in ds_next_clones_obj
 	 * due to a bug in a previous version of the code.
 	 * Only trust it if it has the right number of entries.
 	 */
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
-		VERIFY0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 		    &count));
 	}
-	if (count != ds->ds_phys->ds_num_children - 1)
+	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
 		goto fail;
-	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
+	for (zap_cursor_init(&zc, mos,
+	    dsl_dataset_phys(ds)->ds_next_clones_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 		char buf[ZFS_MAXNAMELEN];
 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		dsl_dir_name(clone->ds_dir, buf);
 		fnvlist_add_boolean(val, buf);
 		dsl_dataset_rele(clone, FTAG);
 	}
 	zap_cursor_fini(&zc);
 	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
 	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
 fail:
 	nvlist_free(val);
 	nvlist_free(propval);
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	uint64_t refd, avail, uobjs, aobjs, ratio;
 
 	ASSERT(dsl_pool_config_held(dp));
 
-	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
-	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
-	    ds->ds_phys->ds_compressed_bytes);
+	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+	    dsl_dataset_phys(ds)->ds_compressed_bytes);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
-	    ds->ds_phys->ds_uncompressed_bytes);
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-		    ds->ds_phys->ds_unique_bytes);
+		    dsl_dataset_phys(ds)->ds_unique_bytes);
 		get_clones_stat(ds, nv);
 	} else {
 		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
 			char buf[MAXNAMELEN];
 			dsl_dataset_name(ds->ds_prev, buf);
 			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
 		}
 
 		dsl_dir_stats(ds->ds_dir, nv);
 	}
 
 	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
-	    ds->ds_phys->ds_creation_time);
+	    dsl_dataset_phys(ds)->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
-	    ds->ds_phys->ds_creation_txg);
+	    dsl_dataset_phys(ds)->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
 	    ds->ds_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
 	    ds->ds_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
-	    ds->ds_phys->ds_guid);
+	    dsl_dataset_phys(ds)->ds_guid);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
-	    ds->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
 	    ds->ds_object);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
 	    ds->ds_userrefs);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
 	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		uint64_t written, comp, uncomp;
 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
 		dsl_dataset_t *prev;
 
 		int err = dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err == 0) {
 			err = dsl_dataset_space_written(prev, ds, &written,
 			    &comp, &uncomp);
 			dsl_dataset_rele(prev, FTAG);
 			if (err == 0) {
 				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
 				    written);
 			}
 		}
 	}
 }
 
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	ASSERT(dsl_pool_config_held(dp));
 
-	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
-	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
-	stat->dds_guid = ds->ds_phys->ds_guid;
+	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+	stat->dds_inconsistent =
+	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
 	stat->dds_origin[0] = '\0';
 	if (dsl_dataset_is_snapshot(ds)) {
 		stat->dds_is_snapshot = B_TRUE;
-		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+		stat->dds_num_clones =
+		    dsl_dataset_phys(ds)->ds_num_children - 1;
 	} else {
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
 
 		if (dsl_dir_is_clone(ds->ds_dir)) {
 			dsl_dataset_t *ods;
 
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
+			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+			    FTAG, &ods));
 			dsl_dataset_name(ods, stat->dds_origin);
 			dsl_dataset_rele(ods, FTAG);
 		}
 	}
 }
 
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	return (ds->ds_fsid_guid);
 }
 
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
+	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
-		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+		*availbytesp +=
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 	if (ds->ds_quota != 0) {
 		/*
 		 * Adjust available bytes according to refquota
 		 */
 		if (*refdbytesp < ds->ds_quota)
 			*availbytesp = MIN(*availbytesp,
 			    ds->ds_quota - *refdbytesp);
 		else
 			*availbytesp = 0;
 	}
-	*usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
+	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 boolean_t
 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 	if (snap == NULL)
 		return (B_FALSE);
-	if (ds->ds_phys->ds_bp.blk_birth >
-	    snap->ds_phys->ds_creation_txg) {
+	if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
+	    dsl_dataset_phys(snap)->ds_creation_txg) {
 		objset_t *os, *os_snap;
 		/*
 		 * It may be that only the ZIL differs, because it was
 		 * reset in the head.  Don't count that as being
 		 * modified.
 		 */
 		if (dmu_objset_from_ds(ds, &os) != 0)
 			return (B_TRUE);
 		if (dmu_objset_from_ds(snap, &os_snap) != 0)
 			return (B_TRUE);
 		return (bcmp(&os->os_phys->os_meta_dnode,
 		    &os_snap->os_phys->os_meta_dnode,
 		    sizeof (os->os_phys->os_meta_dnode)) != 0);
 	}
 	return (B_FALSE);
 }
 
 typedef struct dsl_dataset_rename_snapshot_arg {
 	const char *ddrsa_fsname;
 	const char *ddrsa_oldsnapname;
 	const char *ddrsa_newsnapname;
 	boolean_t ddrsa_recursive;
 	dmu_tx_t *ddrsa_tx;
 } dsl_dataset_rename_snapshot_arg_t;
 
 /* ARGSUSED */
 static int
 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	int error;
 	uint64_t val;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
 	if (error != 0) {
 		/* ignore nonexistent snapshots */
 		return (error == ENOENT ? 0 : error);
 	}
 
 	/* new name should not exist */
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
 	if (error == 0)
 		error = SET_ERROR(EEXIST);
 	else if (error == ENOENT)
 		error = 0;
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
 	if (dsl_dir_namelen(hds->ds_dir) + 1 +
 	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
 		error = SET_ERROR(ENAMETOOLONG);
 
 	return (error);
 }
 
 static int
 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
 	if (error != 0)
 		return (error);
 
 	if (ddrsa->ddrsa_recursive) {
 		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
 		    DS_FIND_CHILDREN);
 	} else {
 		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
 	}
 	dsl_dataset_rele(hds, FTAG);
 	return (error);
 }
 
 static int
 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	char *oldname, *newname;
 #endif
 #endif
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_dataset_t *ds;
 	uint64_t val;
 	dmu_tx_t *tx = ddrsa->ddrsa_tx;
 	int error;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
 	ASSERT(error == 0 || error == ENOENT);
 	if (error == ENOENT) {
 		/* ignore nonexistent snapshots */
 		return (0);
 	}
 
 	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
 
 	/* log before we change the name */
 	spa_history_log_internal_ds(ds, "rename", tx,
 	    "-> @%s", ddrsa->ddrsa_newsnapname);
 
 	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
 	    B_FALSE));
 	mutex_enter(&ds->ds_lock);
 	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
 	mutex_exit(&ds->ds_lock);
-	VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+	VERIFY0(zap_add(dp->dp_meta_objset,
+	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
 	    ddrsa->ddrsa_oldsnapname);
 	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
 	    ddrsa->ddrsa_newsnapname);
 	zfsvfs_update_fromname(oldname, newname);
 	zvol_rename_minors(oldname, newname);
 	kmem_free(newname, MAXPATHLEN);
 	kmem_free(oldname, MAXPATHLEN);
 #endif
 #endif
 	dsl_dataset_rele(ds, FTAG);
 
 	return (0);
 }
 
 static void
 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
 	ddrsa->ddrsa_tx = tx;
 	if (ddrsa->ddrsa_recursive) {
 		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
 		    DS_FIND_CHILDREN));
 	} else {
 		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
 	}
 	dsl_dataset_rele(hds, FTAG);
 }
 
 int
 dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
 {
 	dsl_dataset_rename_snapshot_arg_t ddrsa;
 
 	ddrsa.ddrsa_fsname = fsname;
 	ddrsa.ddrsa_oldsnapname = oldsnapname;
 	ddrsa.ddrsa_newsnapname = newsnapname;
 	ddrsa.ddrsa_recursive = recursive;
 
 	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
 	    dsl_dataset_rename_snapshot_sync, &ddrsa,
 	    1, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * If we're doing an ownership handoff, we need to make sure that there is
  * only one long hold on the dataset.  We're not allowed to change anything here
  * so we don't permanently release the long hold or regular hold here.  We want
  * to do this only when syncing to avoid the dataset unexpectedly going away
  * when we release the long hold.
  */
 static int
 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
 {
 	boolean_t held;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	if (owner != NULL) {
 		VERIFY3P(ds->ds_owner, ==, owner);
 		dsl_dataset_long_rele(ds, owner);
 	}
 
 	held = dsl_dataset_long_held(ds);
 
 	if (owner != NULL)
 		dsl_dataset_long_hold(ds, owner);
 
 	if (held)
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 typedef struct dsl_dataset_rollback_arg {
 	const char *ddra_fsname;
 	void *ddra_owner;
 	nvlist_t *ddra_result;
 } dsl_dataset_rollback_arg_t;
 
 static int
 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int64_t unused_refres_delta;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* must not be a snapshot */
 	if (dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must have a most recent snapshot */
-	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must not have any bookmarks after the most recent snapshot */
 	nvlist_t *proprequest = fnvlist_alloc();
 	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
 	nvlist_t *bookmarks = fnvlist_alloc();
 	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
 	fnvlist_free(proprequest);
-	if (error != 0)
-		return (error);
 	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
-	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+	    error == 0 && pair != NULL;
+	    pair = nvlist_next_nvpair(bookmarks, pair)) {
 		nvlist_t *valuenv =
 		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
 		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
 		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
-		if (createtxg > ds->ds_phys->ds_prev_snap_txg) {
-			fnvlist_free(bookmarks);
-			dsl_dataset_rele(ds, FTAG);
-			return (SET_ERROR(EEXIST));
-		}
+		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg)
+			error = SET_ERROR(EEXIST);
 	}
 	fnvlist_free(bookmarks);
+	if (error != 0) {	/* lookup failed or EEXIST */
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
 
 	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Check if the snap we are rolling back to uses more than
 	 * the refquota.
 	 */
 	if (ds->ds_quota != 0 &&
-	    ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * When we do the clone swap, we will temporarily use more space
 	 * due to the refreservation (the head will no longer have any
 	 * unique space, so the entire amount of the refreservation will need
 	 * to be free).  We will immediately destroy the clone, freeing
 	 * this space, but the freeing happens over many txg's.
 	 */
 	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
-	    ds->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
 	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds, *clone;
 	uint64_t cloneobj;
 	char namebuf[ZFS_MAXNAMELEN];
 
 	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
 
 	dsl_dataset_name(ds->ds_prev, namebuf);
 	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
 
 	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
 	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
 
 	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
 	dsl_dataset_zero_zil(ds, tx);
 
 	dsl_destroy_head_sync_impl(clone, tx);
 
 	dsl_dataset_rele(clone, FTAG);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Rolls back the given filesystem or volume to the most recent snapshot.
  * The name of the most recent snapshot will be returned under key "target"
  * in the result nvlist.
  *
  * If owner != NULL:
  * - The existing dataset MUST be owned by the specified owner at entry
  * - Upon return, dataset will still be held by the same owner, whether we
  *   succeed or not.
  *
  * This mode is required any time the existing filesystem is mounted.  See
  * notes above zfs_suspend_fs() for further details.
  */
 int
 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
 {
 	dsl_dataset_rollback_arg_t ddra;
 
 	ddra.ddra_fsname = fsname;
 	ddra.ddra_owner = owner;
 	ddra.ddra_result = result;
 
 	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
 	    dsl_dataset_rollback_sync, &ddra,
 	    1, ZFS_SPACE_CHECK_RESERVED));
 }
 
 struct promotenode {
 	list_node_t link;
 	dsl_dataset_t *ds;
 };
 
 typedef struct dsl_dataset_promote_arg {
 	const char *ddpa_clonename;
 	dsl_dataset_t *ddpa_clone;
 	list_t shared_snaps, origin_snaps, clone_snaps;
 	dsl_dataset_t *origin_origin; /* origin of the origin */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	char *err_ds;
 	cred_t *cr;
 } dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
 static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
     void *tag);
 static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
 
 static int
 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	dsl_dataset_t *origin_ds;
 	int err;
 	uint64_t unused;
 	uint64_t ss_mv_cnt;
 
 	err = promote_hold(ddpa, dp, FTAG);
 	if (err != 0)
 		return (err);
 
 	hds = ddpa->ddpa_clone;
 
-	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
 		promote_rele(ddpa, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/*
 	 * Compute and check the amount of space to transfer.  Since this is
 	 * so expensive, don't do the preliminary check.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		promote_rele(ddpa, FTAG);
 		return (0);
 	}
 
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
 
 	/* compute origin's new unique space */
 	snap = list_tail(&ddpa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
-	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
 	    &ddpa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
 	 * to used by each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
 	 * So a sequence would look like:
 	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
 	 * Which simplifies to:
 	 * uN + kN + kN-1 + ... + k1 + k0
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
 	ss_mv_cnt = 0;
-	ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
-	ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-	ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
 		ss_mv_cnt++;
 
 		/*
 		 * If there are long holds, we won't be able to evict
 		 * the objset.
 		 */
 		if (dsl_dataset_long_held(ds)) {
 			err = SET_ERROR(EBUSY);
 			goto out;
 		}
 
 		/* Check that the snapshot name does not conflict */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
 			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
 			err = SET_ERROR(EEXIST);
 			goto out;
 		}
 		if (err != ENOENT)
 			goto out;
 
 		/* The very first snapshot does not have a deadlist */
-		if (ds->ds_phys->ds_prev_snap_obj == 0)
+		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
 			continue;
 
 		dsl_deadlist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp);
 		ddpa->used += dlused;
 		ddpa->comp += dlcomp;
 		ddpa->uncomp += dluncomp;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (ddpa->origin_origin) {
-		ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
-		ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+		ddpa->used -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+		ddpa->comp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
 		ddpa->uncomp -=
-		    ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
+		    dsl_dataset_phys(ddpa->origin_origin)->
+		    ds_uncompressed_bytes;
 	}
 
 	/* Check that there is enough space and limit headroom here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
 	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
 	if (err != 0)
 		goto out;
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
 	 * after the promotion (for both origin and clone).  For each,
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
-	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
 		 * so dd_origin_txg will be < TXG_INITIAL, so
 		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
 		snap = list_head(&ddpa->origin_snaps);
 		err = snaplist_space(&ddpa->shared_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
 		if (err != 0)
 			goto out;
 
 		err = snaplist_space(&ddpa->clone_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &space);
 		if (err != 0)
 			goto out;
 		ddpa->cloneusedsnap += space;
 	}
-	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+	    DD_FLAG_USED_BREAKDOWN) {
 		err = snaplist_space(&ddpa->origin_snaps,
-		    origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
+		    &ddpa->originusedsnap);
 		if (err != 0)
 			goto out;
 	}
 
 out:
 	promote_rele(ddpa, FTAG);
 	return (err);
 }
 
 static void
 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	dsl_dataset_t *origin_ds;
 	dsl_dataset_t *origin_head;
 	dsl_dir_t *dd;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	char *oldname, *newname;
 #endif
 
 	VERIFY0(promote_hold(ddpa, dp, FTAG));
 	hds = ddpa->ddpa_clone;
 
-	ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
 
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
 	dd = hds->ds_dir;
 
 	snap = list_head(&ddpa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
 	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
-	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
+	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
 	snap = list_tail(&ddpa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
-	if (origin_ds->ds_phys->ds_next_clones_obj) {
+	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
 		dsl_dataset_remove_from_next_clones(origin_ds,
 		    snap->ds->ds_object, tx);
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    origin_ds->ds_phys->ds_next_clones_obj,
+		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
-	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
 	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
-	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
 	origin_head->ds_dir->dd_origin_txg =
-	    origin_ds->ds_phys->ds_creation_txg;
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
-		    odd->dd_phys->dd_clones, hds->ds_object, tx));
+		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    hds->ds_object, tx));
 
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
-		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    origin_head->ds_object, tx));
-		if (dd->dd_phys->dd_clones == 0) {
-			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
-			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+		if (dsl_dir_phys(dd)->dd_clones == 0) {
+			dsl_dir_phys(dd)->dd_clones =
+			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+			    DMU_OT_NONE, 0, tx);
 		}
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
 	}
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
 	mutex_enter(&spa_namespace_lock);
-
 	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 #endif
 
 	/* move snapshots to this dir */
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+		dsl_dataset_name(ds, oldname);	/* name before the move */
+#endif
 
 		/*
 		 * Property callbacks are registered to a particular
 		 * dsl_dir.  Since ours is changing, evict the objset
 		 * so that they will be unregistered from the old dsl_dir.
 		 */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
 
 		/* move snap name entry */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		VERIFY0(dsl_dataset_snap_remove(origin_head,
 		    ds->ds_snapname, tx, B_TRUE));
 		VERIFY0(zap_add(dp->dp_meta_objset,
-		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
-		ds->ds_phys->ds_dir_obj = dd->dd_object;
+		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_rele(ds->ds_dir, ds);
 		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 		dsl_dataset_name(ds, newname);
 		zfsvfs_update_fromname(oldname, newname);
 		zvol_rename_minors(oldname, newname);
 #endif
 
 		/* move any clone references */
-		if (ds->ds_phys->ds_next_clones_obj &&
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
 		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj);
+			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				dsl_dataset_t *cnds;
 				uint64_t o;
 
 				if (za.za_first_integer == oldnext_obj) {
 					/*
 					 * We've already moved the
 					 * origin's reference.
 					 */
 					continue;
 				}
 
 				VERIFY0(dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
-				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+				o = dsl_dir_phys(cnds->ds_dir)->
+				    dd_head_dataset_obj;
 
 				VERIFY0(zap_remove_int(dp->dp_meta_objset,
-				    odd->dd_phys->dd_clones, o, tx));
+				    dsl_dir_phys(odd)->dd_clones, o, tx));
 				VERIFY0(zap_add_int(dp->dp_meta_objset,
-				    dd->dd_phys->dd_clones, o, tx));
+				    dsl_dir_phys(dd)->dd_clones, o, tx));
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
 		}
 
 		ASSERT(!dsl_prop_hascb(ds));
 	}
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	mutex_exit(&spa_namespace_lock);
-
 	kmem_free(newname, MAXPATHLEN);
 	kmem_free(oldname, MAXPATHLEN);
 #endif
 	/*
 	 * Change space accounting.
-	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
+	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
 	 * both be valid, or both be 0 (resulting in delta == 0).  This
 	 * is true for each of {clone,origin} independently.
 	 */
-
 	delta = ddpa->cloneusedsnap -
-	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
 	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
 	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
 	delta = ddpa->originusedsnap -
-	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
 	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
 	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
-	origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
+	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, "");
 
 	dsl_dir_rele(odd, FTAG);
 	promote_rele(ddpa, FTAG);
 }
 
 /*
  * Make a list of dsl_dataset_t's for the snapshots between first_obj
  * (exclusive) and last_obj (inclusive).  The list will be in reverse
  * order (last_obj will be the list_head()).  If first_obj == 0, do all
  * snapshots back to this dataset's origin.
  */
 static int
 snaplist_make(dsl_pool_t *dp,
     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
 {
 	uint64_t obj = last_obj;
 
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
 	while (obj != first_obj) {
 		dsl_dataset_t *ds;
 		struct promotenode *snap;
 		int err;
 
 		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 		ASSERT(err != ENOENT);
 		if (err != 0)
 			return (err);
 
 		if (first_obj == 0)
-			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
 
 		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
 		snap->ds = ds;
 		list_insert_tail(l, snap);
-		obj = ds->ds_phys->ds_prev_snap_obj;
+		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 
 	return (0);
 }
 
 static int
 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 {
 	struct promotenode *snap;
 
 	*spacep = 0;
 	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
 		*spacep += used;
 	}
 	return (0);
 }
 
 static void
 snaplist_destroy(list_t *l, void *tag)
 {
 	struct promotenode *snap;
 
 	if (l == NULL || !list_link_active(&l->list_head))
 		return;
 
 	while ((snap = list_tail(l)) != NULL) {
 		list_remove(l, snap);
 		dsl_dataset_rele(snap->ds, tag);
 		kmem_free(snap, sizeof (*snap));
 	}
 	list_destroy(l);
 }
 
 static int
 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
 {
 	int error;
 	dsl_dir_t *dd;
 	struct promotenode *snap;
 
 	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
 	    &ddpa->ddpa_clone);
 	if (error != 0)
 		return (error);
 	dd = ddpa->ddpa_clone->ds_dir;
 
 	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
 	    !dsl_dir_is_clone(dd)) {
 		dsl_dataset_rele(ddpa->ddpa_clone, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
-	error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
 	    &ddpa->shared_snaps, tag);
 	if (error != 0)
 		goto out;
 
 	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
 	    &ddpa->clone_snaps, tag);
 	if (error != 0)
 		goto out;
 
 	snap = list_head(&ddpa->shared_snaps);
-	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-	error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
-	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
 	    &ddpa->origin_snaps, tag);
 	if (error != 0)
 		goto out;
 
-	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
 		error = dsl_dataset_hold_obj(dp,
-		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
+		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
 		    tag, &ddpa->origin_origin);
 		if (error != 0)
 			goto out;
 	}
 out:
 	if (error != 0)
 		promote_rele(ddpa, tag);
 	return (error);
 }
 
 static void
 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
 {
 	snaplist_destroy(&ddpa->shared_snaps, tag);
 	snaplist_destroy(&ddpa->clone_snaps, tag);
 	snaplist_destroy(&ddpa->origin_snaps, tag);
 	if (ddpa->origin_origin != NULL)
 		dsl_dataset_rele(ddpa->origin_origin, tag);
 	dsl_dataset_rele(ddpa->ddpa_clone, tag);
 }
 
 /*
  * Promote a clone.
  *
  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
  * in with the name.  (It must be at least MAXNAMELEN bytes long.)
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
 {
 	dsl_dataset_promote_arg_t ddpa = { 0 };
 	uint64_t numsnaps;
 	int error;
 	objset_t *os;
 
 	/*
 	 * We will modify space proportional to the number of
 	 * snapshots.  Compute numsnaps.
 	 */
 	error = dmu_objset_hold(name, FTAG, &os);
 	if (error != 0)
 		return (error);
 	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
-	    dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+	    &numsnaps);
 	dmu_objset_rele(os, FTAG);
 	if (error != 0)
 		return (error);
 
 	ddpa.ddpa_clonename = name;
 	ddpa.err_ds = conflsnap;
 	ddpa.cr = CRED();
 
 	return (dsl_sync_task(name, dsl_dataset_promote_check,
 	    dsl_dataset_promote_sync, &ddpa,
 	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
 	int64_t unused_refres_delta;
 
 	/* they should both be heads */
 	if (dsl_dataset_is_snapshot(clone) ||
 	    dsl_dataset_is_snapshot(origin_head))
 		return (SET_ERROR(EINVAL));
 
 	/* if we are not forcing, the branch point should be just before them */
 	if (!force && clone->ds_prev != origin_head->ds_prev)
 		return (SET_ERROR(EINVAL));
 
 	/* clone should be the clone (unless they are unrelated) */
 	if (clone->ds_prev != NULL &&
 	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
 	    origin_head->ds_dir != clone->ds_prev->ds_dir)
 		return (SET_ERROR(EINVAL));
 
 	/* the clone should be a child of the origin */
 	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
 		return (SET_ERROR(EINVAL));
 
 	/* origin_head shouldn't be modified unless 'force' */
 	if (!force &&
 	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
 		return (SET_ERROR(ETXTBSY));
 
 	/* origin_head should have no long holds (e.g. is not mounted) */
 	if (dsl_dataset_handoff_check(origin_head, owner, tx))
 		return (SET_ERROR(EBUSY));
 
 	/* check amount of any unconsumed refreservation */
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    origin_head->ds_phys->ds_unique_bytes) -
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    clone->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
 	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/* clone can't be over the head's refquota */
 	if (origin_head->ds_quota != 0 &&
-	    clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
+	    dsl_dataset_phys(clone)->ds_referenced_bytes >
+	    origin_head->ds_quota)
 		return (SET_ERROR(EDQUOT));
 
 	return (0);
 }
 
 void
 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int64_t unused_refres_delta;
 
 	ASSERT(clone->ds_reserved == 0);
 	ASSERT(origin_head->ds_quota == 0 ||
-	    clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
+	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
 	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
 	dmu_buf_will_dirty(clone->ds_dbuf, tx);
 	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
 	if (clone->ds_objset != NULL) {
 		dmu_objset_evict(clone->ds_objset);
 		clone->ds_objset = NULL;
 	}
 
 	if (origin_head->ds_objset != NULL) {
 		dmu_objset_evict(origin_head->ds_objset);
 		origin_head->ds_objset = NULL;
 	}
 
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    origin_head->ds_phys->ds_unique_bytes) -
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    clone->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/*
 	 * Reset origin's unique bytes, if it exists.
 	 */
 	if (clone->ds_prev) {
 		dsl_dataset_t *origin = clone->ds_prev;
 		uint64_t comp, uncomp;
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_deadlist_space_range(&clone->ds_deadlist,
-		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
 	{
 		blkptr_t tmp;
-		tmp = origin_head->ds_phys->ds_bp;
-		origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
-		clone->ds_phys->ds_bp = tmp;
+		tmp = dsl_dataset_phys(origin_head)->ds_bp;
+		dsl_dataset_phys(origin_head)->ds_bp =
+		    dsl_dataset_phys(clone)->ds_bp;
+		dsl_dataset_phys(clone)->ds_bp = tmp;
 	}
 
 	/* set dd_*_bytes */
 	{
 		int64_t dused, dcomp, duncomp;
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
-		ASSERT3U(clone->ds_dir->dd_phys->
+		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
 		dsl_deadlist_space(&clone->ds_deadlist,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space(&origin_head->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
-		dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
-		    (origin_head->ds_phys->ds_referenced_bytes + odl_used);
-		dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
-		    (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
-		duncomp = clone->ds_phys->ds_uncompressed_bytes +
+		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+		    cdl_used -
+		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+		    odl_used);
+		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+		    cdl_comp -
+		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+		    odl_comp);
+		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
 		    cdl_uncomp -
-		    (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+		    odl_uncomp);
 
 		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
 		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
 		    -dused, -dcomp, -duncomp, tx);
 
 		/*
 		 * The difference in the space used by snapshots is the
 		 * difference in snapshot space due to the head's
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
 		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space_range(&origin_head->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
 
 	/* swap ds_*_bytes */
-	SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
-	    clone->ds_phys->ds_referenced_bytes);
-	SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
-	    clone->ds_phys->ds_compressed_bytes);
-	SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
-	    clone->ds_phys->ds_uncompressed_bytes);
-	SWITCH64(origin_head->ds_phys->ds_unique_bytes,
-	    clone->ds_phys->ds_unique_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+	    dsl_dataset_phys(clone)->ds_referenced_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+	    dsl_dataset_phys(clone)->ds_compressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
 	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
 	    unused_refres_delta, 0, 0, tx);
 
 	/*
 	 * Swap deadlists.
 	 */
 	dsl_deadlist_close(&clone->ds_deadlist);
 	dsl_deadlist_close(&origin_head->ds_deadlist);
-	SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
-	    clone->ds_phys->ds_deadlist_obj);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
-	    clone->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
-	    origin_head->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
 	spa_history_log_internal_ds(clone, "clone swap", tx,
 	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
 
 /*
  * Given a pool name and a dataset object number in that pool,
  * return the name of that dataset.
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_pool_hold(pname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 	if (error == 0) {
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
 
 int
 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
 	int error = 0;
 
 	ASSERT3S(asize, >, 0);
 
 	/*
 	 * *ref_rsrv is the portion of asize that will come from any
 	 * unconsumed refreservation space.
 	 */
 	*ref_rsrv = 0;
 
 	mutex_enter(&ds->ds_lock);
 	/*
 	 * Make a space adjustment for reserved bytes.
 	 */
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
 		ASSERT3U(*used, >=,
-		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*used -=
+		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
 		*ref_rsrv =
 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
 
 	if (!check_quota || ds->ds_quota == 0) {
 		mutex_exit(&ds->ds_lock);
 		return (0);
 	}
 	/*
 	 * If they are requesting more space, and our current estimate
 	 * is over quota, they get to try again unless the actual
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+	    ds->ds_quota) {
 		if (inflight > 0 ||
-		    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
+		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
 			error = SET_ERROR(ERESTART);
 		else
 			error = SET_ERROR(EDQUOT);
 	}
 	mutex_exit(&ds->ds_lock);
 
 	return (error);
 }
 
 typedef struct dsl_dataset_set_qr_arg {
 	const char *ddsqra_name;
 	zprop_source_t ddsqra_source;
 	uint64_t ddsqra_value;
 } dsl_dataset_set_qr_arg_t;
 
 
 /* ARGSUSED */
 static int
 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t newval;
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	if (newval == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
-	if (newval < ds->ds_phys->ds_referenced_bytes ||
+	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
 	    newval < ds->ds_reserved) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	dsl_prop_set_sync_impl(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 	    &ddsqra->ddsqra_value, tx);
 
 	VERIFY0(dsl_prop_get_int_ds(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
 
 	if (ds->ds_quota != newval) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_quota = newval;
 	}
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t refquota)
 {
 	dsl_dataset_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = dsname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = refquota;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
 	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static int
 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t newval, unique;
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
 		dsl_dataset_recalc_head_uniq(ds);
-	unique = ds->ds_phys->ds_unique_bytes;
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
 		uint64_t delta = MAX(unique, newval) -
 		    MAX(unique, ds->ds_reserved);
 
 		if (delta >
 		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
 		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 void
 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
 {
 	uint64_t newval;
 	uint64_t unique;
 	int64_t delta;
 
 	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    source, sizeof (value), 1, &value, tx);
 
 	VERIFY0(dsl_prop_get_int_ds(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-	unique = ds->ds_phys->ds_unique_bytes;
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	delta = MAX(0, (int64_t)(newval - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = newval;
 	mutex_exit(&ds->ds_lock);
 
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
 static void
 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 	dsl_dataset_set_refreservation_sync_impl(ds,
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t refreservation)
 {
 	dsl_dataset_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = dsname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = refreservation;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
 	    dsl_dataset_set_refreservation_sync, &ddsqra,
 	    0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Return (in *usedp) the amount of space written in new that is not
  * present in oldsnap.  New may be a snapshot or the head.  Old must be
  * a snapshot before new, in new's filesystem (or its origin).  If not then
  * fail and return EINVAL.
  *
  * The written space is calculated by considering two components:  First, we
  * ignore any freed space, and calculate the written as new's used space
  * minus old's used space.  Next, we add in the amount of space that was freed
  * between the two snapshots, thus reducing new's used space relative to old's.
  * Specifically, this is the space that was born before old->ds_creation_txg,
  * and freed before new (ie. on new's deadlist or a previous deadlist).
  *
  * space freed                         [---------------------]
  * snapshots                       ---O-------O--------O-------O------
  *                                         oldsnap            new
  */
 int
 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	*usedp = 0;
-	*usedp += new->ds_phys->ds_referenced_bytes;
-	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
+	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
 
 	*compp = 0;
-	*compp += new->ds_phys->ds_compressed_bytes;
-	*compp -= oldsnap->ds_phys->ds_compressed_bytes;
+	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
 
 	*uncompp = 0;
-	*uncompp += new->ds_phys->ds_uncompressed_bytes;
-	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
+	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
 
 	snapobj = new->ds_object;
 	while (snapobj != oldsnap->ds_object) {
 		dsl_dataset_t *snap;
 		uint64_t used, comp, uncomp;
 
 		if (snapobj == new->ds_object) {
 			snap = new;
 		} else {
 			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
 			if (err != 0)
 				break;
 		}
 
-		if (snap->ds_phys->ds_prev_snap_txg ==
-		    oldsnap->ds_phys->ds_creation_txg) {
+		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
 			/*
 			 * The blocks in the deadlist can not be born after
 			 * ds_prev_snap_txg, so get the whole deadlist space,
 			 * which is more efficient (especially for old-format
 			 * deadlists).  Unfortunately the deadlist code
 			 * doesn't have enough information to make this
 			 * optimization itself.
 			 */
 			dsl_deadlist_space(&snap->ds_deadlist,
 			    &used, &comp, &uncomp);
 		} else {
 			dsl_deadlist_space_range(&snap->ds_deadlist,
-			    0, oldsnap->ds_phys->ds_creation_txg,
+			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
 			    &used, &comp, &uncomp);
 		}
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		/*
 		 * If we get to the beginning of the chain of snapshots
 		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
 		 * was not a snapshot of/before new.
 		 */
-		snapobj = snap->ds_phys->ds_prev_snap_obj;
+		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
 		if (snapobj == 0) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 	}
 	return (err);
 }
 
 /*
  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
  * lastsnap, and all snapshots in between are deleted.
  *
  * blocks that would be freed            [---------------------------]
  * snapshots                       ---O-------O--------O-------O--------O
  *                                        firstsnap        lastsnap
  *
  * This is the set of blocks that were born after the snap before firstsnap,
  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
  * We calculate this by iterating over the relevant deadlists (from the snap
  * after lastsnap, backward to the snap after firstsnap), summing up the
  * space on the deadlist that was born after the snap before firstsnap.
  */
 int
 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
     dsl_dataset_t *lastsnap,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
 	ASSERT(dsl_dataset_is_snapshot(firstsnap));
 	ASSERT(dsl_dataset_is_snapshot(lastsnap));
 
 	/*
 	 * Check that the snapshots are in the same dsl_dir, and firstsnap
 	 * is before lastsnap.
 	 */
 	if (firstsnap->ds_dir != lastsnap->ds_dir ||
-	    firstsnap->ds_phys->ds_creation_txg >
-	    lastsnap->ds_phys->ds_creation_txg)
+	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
+	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
 		return (SET_ERROR(EINVAL));
 
 	*usedp = *compp = *uncompp = 0;
 
-	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
+	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
 	while (snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
 		uint64_t used, comp, uncomp;
 
 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
 		if (err != 0)
 			break;
 
 		dsl_deadlist_space_range(&ds->ds_deadlist,
-		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
-		snapobj = ds->ds_phys->ds_prev_snap_obj;
+		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (err);
 }
 
 static int
 dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
 {
 	const char *dsname = arg;
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int error = 0;
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
 		return (SET_ERROR(ENOTSUP));
 
 	ASSERT(spa_feature_is_enabled(dp->dp_spa,
 	    SPA_FEATURE_EXTENSIBLE_DATASET));
 
 	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (ds->ds_large_blocks)
 		error = EALREADY;
 	dsl_dataset_rele(ds, FTAG);
 
 	return (error);
 }
 
 void
 dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
 	uint64_t zero = 0;
 
 	spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
 	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
 
 	VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
 	    sizeof (zero), 1, &zero, tx));
 }
 
 static void
 dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
 {
 	const char *dsname = arg;
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
 
 	dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
 	ASSERT(!ds->ds_large_blocks);
 	ds->ds_large_blocks = B_TRUE;
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_activate_large_blocks(const char *dsname)
 {
 	int error;
 
 	error = dsl_sync_task(dsname,
 	    dsl_dataset_activate_large_blocks_check,
 	    dsl_dataset_activate_large_blocks_sync, (void *)dsname,
 	    1, ZFS_SPACE_CHECK_RESERVED);
 
 	/*
 	 * EALREADY indicates that this dataset already supports large blocks.
 	 */
 	if (error == EALREADY)
 		error = 0;
 	return (error);
 }
 
 /*
  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
  * For example, they could both be snapshots of the same filesystem, and
  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
  * filesystem.  Or 'earlier' could be the origin's origin.
  *
  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
  */
 boolean_t
 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
 	uint64_t earlier_txg)
 {
 	dsl_pool_t *dp = later->ds_dir->dd_pool;
 	int error;
 	boolean_t ret;
 
 	ASSERT(dsl_pool_config_held(dp));
 	ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);
 
 	if (earlier_txg == 0)
-		earlier_txg = earlier->ds_phys->ds_creation_txg;
+		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
 
 	if (dsl_dataset_is_snapshot(later) &&
-	    earlier_txg >= later->ds_phys->ds_creation_txg)
+	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
 		return (B_FALSE);
 
 	if (later->ds_dir == earlier->ds_dir)
 		return (B_TRUE);
 	if (!dsl_dir_is_clone(later->ds_dir))
 		return (B_FALSE);
 
-	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
 		return (B_TRUE);
 	dsl_dataset_t *origin;
 	error = dsl_dataset_hold_obj(dp,
-	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
 	if (error != 0)
 		return (B_FALSE);
 	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
 	dsl_dataset_rele(origin, FTAG);
 	return (ret);
 }
 
 
 void
 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	(revision 275782)
@@ -1,534 +1,535 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
 #include <sys/dmu.h>
 #include <sys/refcount.h>
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_pool.h>
 
 /*
  * Deadlist concurrency:
  *
  * Deadlists can only be modified from the syncing thread.
  *
  * Except for dsl_deadlist_insert(), it can only be modified with the
  * dp_config_rwlock held with RW_WRITER.
  *
  * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
  * be called concurrently, from open context, with the dl_config_rwlock held
  * with RW_READER.
  *
  * Therefore, we only need to provide locking between dsl_deadlist_insert() and
  * the accessors, protecting:
  *     dl_phys->dl_used,comp,uncomp
  *     and protecting the dl_tree from being loaded.
  * The locking is provided by dl_lock.  Note that locking on the bpobj_t
  * provides its own locking, and dl_oldfmt is immutable.
  */
 
 static int
 dsl_deadlist_compare(const void *arg1, const void *arg2)
 {
 	const dsl_deadlist_entry_t *dle1 = arg1;
 	const dsl_deadlist_entry_t *dle2 = arg2;
 
 	if (dle1->dle_mintxg < dle2->dle_mintxg)
 		return (-1);
 	else if (dle1->dle_mintxg > dle2->dle_mintxg)
 		return (+1);
 	else
 		return (0);
 }
 
 static void
 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	ASSERT(!dl->dl_oldfmt);
 	if (dl->dl_havetree)
 		return;
 
 	avl_create(&dl->dl_tree, dsl_deadlist_compare,
 	    sizeof (dsl_deadlist_entry_t),
 	    offsetof(dsl_deadlist_entry_t, dle_node));
 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 		dle->dle_mintxg = strtonum(za.za_name, NULL);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
 		    za.za_first_integer));
 		avl_add(&dl->dl_tree, dle);
 	}
 	zap_cursor_fini(&zc);
 	dl->dl_havetree = B_TRUE;
 }
 
 void
 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
 {
 	dmu_object_info_t doi;
 
 	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
 	dl->dl_os = os;
 	dl->dl_object = object;
 	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
 	dmu_object_info_from_db(dl->dl_dbuf, &doi);
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		dmu_buf_rele(dl->dl_dbuf, dl);
 		dl->dl_dbuf = NULL;
 		dl->dl_oldfmt = B_TRUE;
 		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
 		return;
 	}
 
 	dl->dl_oldfmt = B_FALSE;
 	dl->dl_phys = dl->dl_dbuf->db_data;
 	dl->dl_havetree = B_FALSE;
 }
 
 void
 dsl_deadlist_close(dsl_deadlist_t *dl)
 {
 	void *cookie = NULL;
 	dsl_deadlist_entry_t *dle;
 
 	if (dl->dl_oldfmt) {
 		dl->dl_oldfmt = B_FALSE;
 		bpobj_close(&dl->dl_bpobj);
 		return;
 	}
 
 	if (dl->dl_havetree) {
 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
 		    != NULL) {
 			bpobj_close(&dle->dle_bpobj);
 			kmem_free(dle, sizeof (*dle));
 		}
 		avl_destroy(&dl->dl_tree);
 	}
 	dmu_buf_rele(dl->dl_dbuf, dl);
 	mutex_destroy(&dl->dl_lock);
 	dl->dl_dbuf = NULL;
 	dl->dl_phys = NULL;
 }
 
 uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
 		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
 	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
 	    sizeof (dsl_deadlist_phys_t), tx));
 }
 
 void
 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 {
 	dmu_object_info_t doi;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		bpobj_free(os, dlobj, tx);
 		return;
 	}
 
 	for (zap_cursor_init(&zc, os, dlobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t obj = za.za_first_integer;
 		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
 			bpobj_decr_empty(os, tx);
 		else
 			bpobj_free(os, obj, tx);
 	}
 	zap_cursor_fini(&zc);
 	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
 static void
 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     const blkptr_t *bp, dmu_tx_t *tx)
 {
 	if (dle->dle_bpobj.bpo_object ==
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
 	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
 }
 
 static void
 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     uint64_t obj, dmu_tx_t *tx)
 {
 	if (dle->dle_bpobj.bpo_object !=
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
 	} else {
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
 }
 
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
 		return;
 	}
 
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	mutex_enter(&dl->dl_lock);
 	dl->dl_phys->dl_used +=
 	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
 	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
 	mutex_exit(&dl->dl_lock);
 
 	dle_tofind.dle_mintxg = bp->blk_birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	else
 		dle = AVL_PREV(&dl->dl_tree, dle);
 	dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
  * Insert new key in deadlist, which must be > all current entries.
  * mintxg is not inclusive.
  */
 void
 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	uint64_t obj;
 	dsl_deadlist_entry_t *dle;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	dsl_deadlist_load_tree(dl);
 
 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 	dle->dle_mintxg = mintxg;
 	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
 	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
 	    mintxg, obj, tx));
 }
 
 /*
  * Remove this key, merging its entries into the previous key.
  */
 void
 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle, *dle_prev;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
 	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
 	avl_remove(&dl->dl_tree, dle);
 	bpobj_close(&dle->dle_bpobj);
 	kmem_free(dle, sizeof (*dle));
 
 	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
 }
 
 /*
  * Walk ds's snapshots to regenerate generate ZAP & AVL.
  */
 static void
 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_t dl;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	dsl_deadlist_open(&dl, os, dlobj);
 	if (dl.dl_oldfmt) {
 		dsl_deadlist_close(&dl);
 		return;
 	}
 
 	while (mrs_obj != 0) {
 		dsl_dataset_t *ds;
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
-		dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
-		mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+		dsl_deadlist_add_key(&dl,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_deadlist_close(&dl);
 }
 
 uint64_t
 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t *dle;
 	uint64_t newobj;
 
 	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
 
 	if (dl->dl_oldfmt) {
 		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
 		return (newobj);
 	}
 
 	dsl_deadlist_load_tree(dl);
 
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		uint64_t obj;
 
 		if (dle->dle_mintxg >= maxtxg)
 			break;
 
 		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
 		    dle->dle_mintxg, obj, tx));
 	}
 	return (newobj);
 }
 
 void
 dsl_deadlist_space(dsl_deadlist_t *dl,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	if (dl->dl_oldfmt) {
 		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
 		    usedp, compp, uncompp));
 		return;
 	}
 
 	mutex_enter(&dl->dl_lock);
 	*usedp = dl->dl_phys->dl_used;
 	*compp = dl->dl_phys->dl_comp;
 	*uncompp = dl->dl_phys->dl_uncomp;
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * return space used in the range (mintxg, maxtxg].
  * Includes maxtxg, does not include mintxg.
  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
  * larger than any bp in the deadlist (eg. UINT64_MAX)).
  */
 void
 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	dsl_deadlist_entry_t *dle;
 	dsl_deadlist_entry_t dle_tofind;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
 		    mintxg, maxtxg, usedp, compp, uncompp));
 		return;
 	}
 
 	*usedp = *compp = *uncompp = 0;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	/*
 	 * If we don't find this mintxg, there shouldn't be anything
 	 * after it either.
 	 */
 	ASSERT(dle != NULL ||
 	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
 
 	for (; dle && dle->dle_mintxg < maxtxg;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		uint64_t used, comp, uncomp;
 
 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
 
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 	}
 	mutex_exit(&dl->dl_lock);
 }
 
 static void
 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 	uint64_t used, comp, uncomp;
 	bpobj_t bpo;
 
 	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
 	bpobj_close(&bpo);
 
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	mutex_enter(&dl->dl_lock);
 	dl->dl_phys->dl_used += used;
 	dl->dl_phys->dl_comp += comp;
 	dl->dl_phys->dl_uncomp += uncomp;
 	mutex_exit(&dl->dl_lock);
 
 	dle_tofind.dle_mintxg = birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 static int
 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
 	dsl_deadlist_insert(dl, bp, tx);
 	return (0);
 }
 
 /*
  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
  * an empty deadlist.
  */
 void
 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	dmu_buf_t *bonus;
 	dsl_deadlist_phys_t *dlp;
 	dmu_object_info_t doi;
 
 	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		bpobj_t bpo;
 		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 		VERIFY3U(0, ==, bpobj_iterate(&bpo,
 		    dsl_deadlist_insert_cb, dl, tx));
 		bpobj_close(&bpo);
 		return;
 	}
 
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t mintxg = strtonum(za.za_name, NULL);
 		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
 	}
 	zap_cursor_fini(&zc);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
 	dlp = bonus->db_data;
 	dmu_buf_will_dirty(bonus, tx);
 	bzero(dlp, sizeof (*dlp));
 	dmu_buf_rele(bonus, FTAG);
 }
 
 /*
  * Remove entries on dl that are >= mintxg, and put them on the bpobj.
  */
 void
 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	ASSERT(!dl->dl_oldfmt);
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
 
 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
 		mutex_enter(&dl->dl_lock);
 		ASSERT3U(dl->dl_phys->dl_used, >=, used);
 		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
 		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
 		dl->dl_phys->dl_used -= used;
 		dl->dl_phys->dl_comp -= comp;
 		dl->dl_phys->dl_uncomp -= uncomp;
 		mutex_exit(&dl->dl_lock);
 
 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, tx));
 
 		dle_next = AVL_NEXT(&dl->dl_tree, dle);
 		avl_remove(&dl->dl_tree, dle);
 		bpobj_close(&dle->dle_bpobj);
 		kmem_free(dle, sizeof (*dle));
 		dle = dle_next;
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c	(revision 275782)
@@ -1,761 +1,761 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 /*
  * DSL permissions are stored in a two level zap attribute
  * mechanism.   The first level identifies the "class" of
  * entry.  The class is identified by the first 2 letters of
  * the attribute.  The second letter "l" or "d" identifies whether
  * it is a local or descendent permission.  The first letter
  * identifies the type of entry.
  *
  * ul$<id>    identifies permissions granted locally for this userid.
  * ud$<id>    identifies permissions granted on descendent datasets for
  *            this userid.
  * Ul$<id>    identifies permission sets granted locally for this userid.
  * Ud$<id>    identifies permission sets granted on descendent datasets for
  *            this userid.
  * gl$<id>    identifies permissions granted locally for this groupid.
  * gd$<id>    identifies permissions granted on descendent datasets for
  *            this groupid.
  * Gl$<id>    identifies permission sets granted locally for this groupid.
  * Gd$<id>    identifies permission sets granted on descendent datasets for
  *            this groupid.
  * el$        identifies permissions granted locally for everyone.
  * ed$        identifies permissions granted on descendent datasets
  *            for everyone.
  * El$        identifies permission sets granted locally for everyone.
  * Ed$        identifies permission sets granted to descendent datasets for
  *            everyone.
  * c-$        identifies permission to create at dataset creation time.
  * C-$        identifies permission sets to grant locally at dataset creation
  *            time.
  * s-$@<name> permissions defined in specified set @<name>
  * S-$@<name> Sets defined in named set @<name>
  *
  * Each of the above entities points to another zap attribute that contains one
  * attribute for each allowed permission, such as create, destroy,...
  * All of the "upper" case class types will specify permission set names
  * rather than permissions.
  *
  * Basically it looks something like this:
  * ul$12 -> ZAP OBJ -> permissions...
  *
  * The ZAP OBJ is referred to as the jump object.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/cred.h>
 #include <sys/sunddi.h>
 
 #include "zfs_deleg.h"
 
 /*
  * Validate that user is allowed to delegate specified permissions.
  *
  * In order to delegate "create" you must have "create"
  * and "allow".
  */
 int
 dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
 {
 	nvpair_t *whopair = NULL;
 	int error;
 
 	if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
 		return (error);
 
 	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
 
 		VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
 
 		while (permpair = nvlist_next_nvpair(perms, permpair)) {
 			const char *perm = nvpair_name(permpair);
 
 			if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
 				return (SET_ERROR(EPERM));
 
 			if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
 				return (error);
 		}
 	}
 	return (0);
 }
 
 /*
  * Validate that user is allowed to unallow specified permissions.  They
  * must have the 'allow' permission, and even then can only unallow
  * perms for their uid.
  */
 int
 dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
 {
 	nvpair_t *whopair = NULL;
 	int error;
 	char idstr[32];
 
 	if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
 		return (error);
 
 	(void) snprintf(idstr, sizeof (idstr), "%lld",
 	    (longlong_t)crgetuid(cr));
 
 	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
 		zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
 
 		if (type != ZFS_DELEG_USER &&
 		    type != ZFS_DELEG_USER_SETS)
 			return (SET_ERROR(EPERM));
 
 		if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
 			return (SET_ERROR(EPERM));
 	}
 	return (0);
 }
 
 typedef struct dsl_deleg_arg {
 	const char *dda_name;
 	nvlist_t *dda_nvlist;
 } dsl_deleg_arg_t;
 
 static void
 dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_deleg_arg_t *dda = arg;
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
 	uint64_t zapobj;
 
 	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
 
-	zapobj = dd->dd_phys->dd_deleg_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
 	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
 		uint64_t jumpobj;
 
 		perms = fnvpair_value_nvlist(whopair);
 
 		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
 			jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
 			    zapobj, whokey, tx);
 		}
 
 		while (permpair = nvlist_next_nvpair(perms, permpair)) {
 			const char *perm = nvpair_name(permpair);
 			uint64_t n = 0;
 
 			VERIFY(zap_update(mos, jumpobj,
 			    perm, 8, 1, &n, tx) == 0);
 			spa_history_log_internal_dd(dd, "permission update", tx,
 			    "%s %s", whokey, perm);
 		}
 	}
 	dsl_dir_rele(dd, FTAG);
 }
 
 static void
 dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_deleg_arg_t *dda = arg;
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
 	uint64_t zapobj;
 
 	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
-	zapobj = dd->dd_phys->dd_deleg_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dsl_dir_rele(dd, FTAG);
 		return;
 	}
 
 	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
 		uint64_t jumpobj;
 
 		if (nvpair_value_nvlist(whopair, &perms) != 0) {
 			if (zap_lookup(mos, zapobj, whokey, 8,
 			    1, &jumpobj) == 0) {
 				(void) zap_remove(mos, zapobj, whokey, tx);
 				VERIFY(0 == zap_destroy(mos, jumpobj, tx));
 			}
 			spa_history_log_internal_dd(dd, "permission who remove",
 			    tx, "%s", whokey);
 			continue;
 		}
 
 		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
 			continue;
 
 		while (permpair = nvlist_next_nvpair(perms, permpair)) {
 			const char *perm = nvpair_name(permpair);
 			uint64_t n = 0;
 
 			(void) zap_remove(mos, jumpobj, perm, tx);
 			if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
 				(void) zap_remove(mos, zapobj,
 				    whokey, tx);
 				VERIFY(0 == zap_destroy(mos,
 				    jumpobj, tx));
 			}
 			spa_history_log_internal_dd(dd, "permission remove", tx,
 			    "%s %s", whokey, perm);
 		}
 	}
 	dsl_dir_rele(dd, FTAG);
 }
 
 static int
 dsl_deleg_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_deleg_arg_t *dda = arg;
 	dsl_dir_t *dd;
 	int error;
 
 	if (spa_version(dmu_tx_pool(tx)->dp_spa) <
 	    SPA_VERSION_DELEGATED_PERMS) {
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
 	if (error == 0)
 		dsl_dir_rele(dd, FTAG);
 	return (error);
 }
 
 int
 dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
 {
 	dsl_deleg_arg_t dda;
 
 	/* nvp must already have been verified to be valid */
 
 	dda.dda_name = ddname;
 	dda.dda_nvlist = nvp;
 
 	return (dsl_sync_task(ddname, dsl_deleg_check,
 	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
 	    &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Find all 'allow' permissions from a given point and then continue
  * traversing up to the root.
  *
  * This function constructs an nvlist of nvlists.
  * each setpoint is an nvlist composed of an nvlist of an nvlist
-  * of the individual * users/groups/everyone/create
+  * of the individual users/groups/everyone/create
  * permissions.
  *
  * The nvlist will look like this.
  *
  * { source fsname -> { whokeys { permissions,...}, ...}}
  *
  * The fsname nvpairs will be arranged in a bottom up order.  For example,
  * if we have the following structure a/b/c then the nvpairs for the fsnames
  * will be ordered a/b/c, a/b, a.
  */
 int
 dsl_deleg_get(const char *ddname, nvlist_t **nvp)
 {
 	dsl_dir_t *dd, *startdd;
 	dsl_pool_t *dp;
 	int error;
 	objset_t *mos;
 
 	error = dsl_pool_hold(ddname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	dp = startdd->dd_pool;
 	mos = dp->dp_meta_objset;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
 		zap_cursor_t basezc;
 		zap_attribute_t baseza;
 		nvlist_t *sp_nvp;
 		uint64_t n;
 		char source[MAXNAMELEN];
 
-		if (dd->dd_phys->dd_deleg_zapobj == 0 ||
-		    zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
-		    n == 0)
+		if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+		    zap_count(mos,
+		    dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
 			continue;
 
 		sp_nvp = fnvlist_alloc();
 		for (zap_cursor_init(&basezc, mos,
-		    dd->dd_phys->dd_deleg_zapobj);
+		    dsl_dir_phys(dd)->dd_deleg_zapobj);
 		    zap_cursor_retrieve(&basezc, &baseza) == 0;
 		    zap_cursor_advance(&basezc)) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 			nvlist_t *perms_nvp;
 
 			ASSERT(baseza.za_integer_length == 8);
 			ASSERT(baseza.za_num_integers == 1);
 
 			perms_nvp = fnvlist_alloc();
 			for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				fnvlist_add_boolean(perms_nvp, za.za_name);
 			}
 			zap_cursor_fini(&zc);
 			fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp);
 			fnvlist_free(perms_nvp);
 		}
 
 		zap_cursor_fini(&basezc);
 
 		dsl_dir_name(dd, source);
 		fnvlist_add_nvlist(*nvp, source, sp_nvp);
 		nvlist_free(sp_nvp);
 	}
 
 	dsl_dir_rele(startdd, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (0);
 }
 
 /*
  * Routines for dsl_deleg_access() -- access checking.
  */
 typedef struct perm_set {
 	avl_node_t	p_node;
 	boolean_t	p_matched;
 	char		p_setname[ZFS_MAX_DELEG_NAME];
 } perm_set_t;
 
 static int
 perm_set_compare(const void *arg1, const void *arg2)
 {
 	const perm_set_t *node1 = arg1;
 	const perm_set_t *node2 = arg2;
 	int val;
 
 	val = strcmp(node1->p_setname, node2->p_setname);
 	if (val == 0)
 		return (0);
 	return (val > 0 ? 1 : -1);
 }
 
 /*
  * Determine whether a specified permission exists.
  *
  * First the base attribute has to be retrieved.  i.e. ul$12
  * Once the base object has been retrieved the actual permission
-  * is lookup up in the zap object the base object points to.
+  * is looked up in the zap object the base object points to.
  *
  * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
  * there is no perm in that jumpobj.
  */
 static int
 dsl_check_access(objset_t *mos, uint64_t zapobj,
     char type, char checkflag, void *valp, const char *perm)
 {
 	int error;
 	uint64_t jumpobj, zero;
 	char whokey[ZFS_MAX_DELEG_NAME];
 
 	zfs_deleg_whokey(whokey, type, checkflag, valp);
 	error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
 	if (error == 0) {
 		error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
 		if (error == ENOENT)
 			error = SET_ERROR(EPERM);
 	}
 	return (error);
 }
 
 /*
  * check a specified user/group for a requested permission
  */
 static int
 dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
     int checkflag, cred_t *cr)
 {
 	const	gid_t *gids;
 	int	ngids;
 	int	i;
 	uint64_t id;
 
 	/* check for user */
 	id = crgetuid(cr);
 	if (dsl_check_access(mos, zapobj,
 	    ZFS_DELEG_USER, checkflag, &id, perm) == 0)
 		return (0);
 
 	/* check for users primary group */
 	id = crgetgid(cr);
 	if (dsl_check_access(mos, zapobj,
 	    ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
 		return (0);
 
 	/* check for everyone entry */
 	id = -1;
 	if (dsl_check_access(mos, zapobj,
 	    ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
 		return (0);
 
 	/* check each supplemental group user is a member of */
 	ngids = crgetngroups(cr);
 	gids = crgetgroups(cr);
 	for (i = 0; i != ngids; i++) {
 		id = gids[i];
 		if (dsl_check_access(mos, zapobj,
 		    ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
 			return (0);
 	}
 
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Iterate over the sets specified in the specified zapobj
  * and load them into the permsets avl tree.
  */
 static int
 dsl_load_sets(objset_t *mos, uint64_t zapobj,
     char type, char checkflag, void *valp, avl_tree_t *avl)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	perm_set_t *permnode;
 	avl_index_t idx;
 	uint64_t jumpobj;
 	int error;
 	char whokey[ZFS_MAX_DELEG_NAME];
 
 	zfs_deleg_whokey(whokey, type, checkflag, valp);
 
 	error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
 	if (error != 0)
 		return (error);
 
 	for (zap_cursor_init(&zc, mos, jumpobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
 		(void) strlcpy(permnode->p_setname, za.za_name,
 		    sizeof (permnode->p_setname));
 		permnode->p_matched = B_FALSE;
 
 		if (avl_find(avl, permnode, &idx) == NULL) {
 			avl_insert(avl, permnode, idx);
 		} else {
 			kmem_free(permnode, sizeof (perm_set_t));
 		}
 	}
 	zap_cursor_fini(&zc);
 	return (0);
 }
 
 /*
-  * Load all permissions user based on cred belongs to.
+  * Load all permission sets that apply to the user identified by cred.
  */
 static void
 dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
     char checkflag, cred_t *cr)
 {
 	const	gid_t *gids;
 	int	ngids, i;
 	uint64_t id;
 
 	id = crgetuid(cr);
 	(void) dsl_load_sets(mos, zapobj,
 	    ZFS_DELEG_USER_SETS, checkflag, &id, avl);
 
 	id = crgetgid(cr);
 	(void) dsl_load_sets(mos, zapobj,
 	    ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
 
 	(void) dsl_load_sets(mos, zapobj,
 	    ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
 
 	ngids = crgetngroups(cr);
 	gids = crgetgroups(cr);
 	for (i = 0; i != ngids; i++) {
 		id = gids[i];
 		(void) dsl_load_sets(mos, zapobj,
 		    ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
 	}
 }
 
 /*
  * Check if user has requested permission.
  */
 int
 dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	void *cookie;
 	int	error;
 	char	checkflag;
 	objset_t *mos;
 	avl_tree_t permsets;
 	perm_set_t *setnode;
 
 	dp = ds->ds_dir->dd_pool;
 	mos = dp->dp_meta_objset;
 
 	if (dsl_delegation_on(mos) == B_FALSE)
 		return (SET_ERROR(ECANCELED));
 
 	if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
 	    SPA_VERSION_DELEGATED_PERMS)
 		return (SET_ERROR(EPERM));
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		/*
 		 * Snapshots are treated as descendents only,
 		 * local permissions do not apply.
 		 */
 		checkflag = ZFS_DELEG_DESCENDENT;
 	} else {
 		checkflag = ZFS_DELEG_LOCAL;
 	}
 
 	avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
 	    offsetof(perm_set_t, p_node));
 
 	ASSERT(dsl_pool_config_held(dp));
 	for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
 	    checkflag = ZFS_DELEG_DESCENDENT) {
 		uint64_t zapobj;
 		boolean_t expanded;
 
 		/*
 		 * If not in global zone then make sure
 		 * the zoned property is set
 		 */
 		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
 
 			if (dsl_prop_get_dd(dd,
 			    zfs_prop_to_name(ZFS_PROP_ZONED),
 			    8, 1, &zoned, NULL, B_FALSE) != 0)
 				break;
 			if (!zoned)
 				break;
 		}
-		zapobj = dd->dd_phys->dd_deleg_zapobj;
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (zapobj == 0)
 			continue;
 
 		dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
 again:
 		expanded = B_FALSE;
 		for (setnode = avl_first(&permsets); setnode;
 		    setnode = AVL_NEXT(&permsets, setnode)) {
 			if (setnode->p_matched == B_TRUE)
 				continue;
 
 			/* See if this set directly grants this permission */
 			error = dsl_check_access(mos, zapobj,
 			    ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
 			if (error == 0)
 				goto success;
 			if (error == EPERM)
 				setnode->p_matched = B_TRUE;
 
 			/* See if this set includes other sets */
 			error = dsl_load_sets(mos, zapobj,
 			    ZFS_DELEG_NAMED_SET_SETS, 0,
 			    setnode->p_setname, &permsets);
 			if (error == 0)
 				setnode->p_matched = expanded = B_TRUE;
 		}
 		/*
 		 * If we expanded any sets, that will define more sets,
 		 * which we need to check.
 		 */
 		if (expanded)
 			goto again;
 
 		error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
 		if (error == 0)
 			goto success;
 	}
 	error = SET_ERROR(EPERM);
 success:
 
 	cookie = NULL;
 	while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
 		kmem_free(setnode, sizeof (perm_set_t));
 
 	return (error);
 }
 
 int
 dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_pool_hold(dsname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (error == 0) {
 		error = dsl_deleg_access_impl(ds, perm, cr);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
 
 /*
  * Other routines.
  */
 
 static void
 copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
     boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	uint64_t jumpobj, pjumpobj;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	char whokey[ZFS_MAX_DELEG_NAME];
 
 	zfs_deleg_whokey(whokey,
 	    dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
 	    ZFS_DELEG_LOCAL, NULL);
 	if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
 		return;
 
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
 	zfs_deleg_whokey(whokey,
 	    dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
 	    ZFS_DELEG_LOCAL, &uid);
 	if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
 		jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 		VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
 	}
 
 	for (zap_cursor_init(&zc, mos, pjumpobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t zero = 0;
 		ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
 
 		VERIFY(zap_update(mos, jumpobj, za.za_name,
 		    8, 1, &zero, tx) == 0);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
-  * set all create time permission on new dataset.
+  * Set all create-time permissions on the new dataset.
  */
 void
 dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
 {
 	dsl_dir_t *dd;
 	uint64_t uid = crgetuid(cr);
 
 	if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
 	    SPA_VERSION_DELEGATED_PERMS)
 		return;
 
 	for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
-		uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+		uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (pzapobj == 0)
 			continue;
 
 		copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
 		copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
 	}
 }
 
 int
 dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	if (zapobj == 0)
 		return (0);
 
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
 		VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
 	}
 	zap_cursor_fini(&zc);
 	VERIFY(0 == zap_destroy(mos, zapobj, tx));
 	return (0);
 }
 
 boolean_t
 dsl_delegation_on(objset_t *os)
 {
 	return (!!spa_delegation(os->os_spa));
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	(revision 275782)
@@ -1,956 +1,971 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
 
 typedef struct dmu_snapshots_destroy_arg {
 	nvlist_t *dsda_snaps;
 	nvlist_t *dsda_successful_snaps;
 	boolean_t dsda_defer;
 	nvlist_t *dsda_errlist;
 } dmu_snapshots_destroy_arg_t;
 
 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {
 	if (!dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	if (dsl_dataset_long_held(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Only allow deferred destroy on pools that support it.
 	 * NOTE: deferred destroy is only supported on snapshots.
 	 */
 	if (defer) {
 		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
 		    SPA_VERSION_USERREFS)
 			return (SET_ERROR(ENOTSUP));
 		return (0);
 	}
 
 	/*
 	 * If this snapshot has an elevated user reference count,
 	 * we can't destroy it yet.
 	 */
 	if (ds->ds_userrefs > 0)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Can't delete a branch point.
 	 */
-	if (ds->ds_phys->ds_num_children > 1)
+	if (dsl_dataset_phys(ds)->ds_num_children > 1)
 		return (SET_ERROR(EEXIST));
 
 	return (0);
 }
 
 static int
 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_snapshots_destroy_arg_t *dsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 	int error = 0;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
 		dsl_dataset_t *ds;
 
 		error = dsl_dataset_hold(dp, nvpair_name(pair),
 		    FTAG, &ds);
 
 		/*
 		 * If the snapshot does not exist, silently ignore it
 		 * (it's "already destroyed").
 		 */
 		if (error == ENOENT)
 			continue;
 
 		if (error == 0) {
 			error = dsl_destroy_snapshot_check_impl(ds,
 			    dsda->dsda_defer);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (error == 0) {
 			fnvlist_add_boolean(dsda->dsda_successful_snaps,
 			    nvpair_name(pair));
 		} else {
 			fnvlist_add_int32(dsda->dsda_errlist,
 			    nvpair_name(pair), error);
 		}
 	}
 
 	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
 	if (pair != NULL)
 		return (fnvpair_value_int32(pair));
 
 	return (0);
 }
 
 struct process_old_arg {
 	dsl_dataset_t *ds;
 	dsl_dataset_t *ds_prev;
 	boolean_t after_branch_point;
 	zio_t *pio;
 	uint64_t used, comp, uncomp;
 };
 
 static int
 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	struct process_old_arg *poa = arg;
 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
 		    bp->blk_birth >
-		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
-			poa->ds_prev->ds_phys->ds_unique_bytes +=
+		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
 		}
 	} else {
 		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
 		poa->comp += BP_GET_PSIZE(bp);
 		poa->uncomp += BP_GET_UCSIZE(bp);
 		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
 	}
 	return (0);
 }
 
 static void
 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
 {
 	struct process_old_arg poa = { 0 };
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t deadlist_obj;
 
 	ASSERT(ds->ds_deadlist.dl_oldfmt);
 	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
 
 	poa.ds = ds;
 	poa.ds_prev = ds_prev;
 	poa.after_branch_point = after_branch_point;
 	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 	    process_old_cb, &poa, tx));
 	VERIFY0(zio_wait(poa.pio));
-	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	/* change snapused */
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 	    -poa.used, -poa.comp, -poa.uncomp, tx);
 
 	/* swap next's deadlist to our deadlist */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_close(&ds_next->ds_deadlist);
-	deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
-	ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
-	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
-	    ds_next->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
 }
 
 static void
 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	/*
 	 * If it is the old version, dd_clones doesn't exist so we can't
 	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
 	 * doesn't matter.
 	 */
-	if (ds->ds_dir->dd_phys->dd_clones == 0)
+	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
 		return;
 
-	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+	for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 
 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		if (clone->ds_dir->dd_origin_txg > mintxg) {
 			dsl_deadlist_remove_key(&clone->ds_deadlist,
 			    mintxg, tx);
 			dsl_dataset_remove_clones_key(clone, mintxg, tx);
 		}
 		dsl_dataset_rele(clone, FTAG);
 	}
 	zap_cursor_fini(&zc);
 }
 
 void
 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 {
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	uint64_t obj;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
 	ASSERT(refcount_is_zero(&ds->ds_longholds));
 
 	if (defer &&
-	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+	    (ds->ds_userrefs > 0 ||
+	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
 		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
 		return;
 	}
 
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 
 	/* We need to log before removing it from the namespace. */
 	spa_history_log_internal_ds(ds, "destroy", tx, "");
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	obj = ds->ds_object;
 
 	if (ds->ds_large_blocks) {
 		ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
 	}
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		ASSERT3P(ds->ds_prev, ==, NULL);
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
 		after_branch_point =
-		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
+		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
-		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
+		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
-			if (ds->ds_phys->ds_next_snap_obj != 0) {
+			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 				VERIFY0(zap_add_int(mos,
-				    ds_prev->ds_phys->ds_next_clones_obj,
-				    ds->ds_phys->ds_next_snap_obj, tx));
+				    dsl_dataset_phys(ds_prev)->
+				    ds_next_clones_obj,
+				    dsl_dataset_phys(ds)->ds_next_snap_obj,
+				    tx));
 			}
 		}
 		if (!after_branch_point) {
-			ds_prev->ds_phys->ds_next_snap_obj =
-			    ds->ds_phys->ds_next_snap_obj;
+			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 		}
 	}
 
 	dsl_dataset_t *ds_next;
 	uint64_t old_unique;
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-	ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
 
-	old_unique = ds_next->ds_phys->ds_unique_bytes;
+	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-	ds_next->ds_phys->ds_prev_snap_obj =
-	    ds->ds_phys->ds_prev_snap_obj;
-	ds_next->ds_phys->ds_prev_snap_txg =
-	    ds->ds_phys->ds_prev_snap_txg;
-	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-	    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
 
 	if (ds_next->ds_deadlist.dl_oldfmt) {
 		process_old_deadlist(ds, ds_prev, ds_next,
 		    after_branch_point, tx);
 	} else {
 		/* Adjust prev's unique space. */
 		if (ds_prev && !after_branch_point) {
 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
-			    ds_prev->ds_phys->ds_prev_snap_txg,
-			    ds->ds_phys->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 			    &used, &comp, &uncomp);
-			ds_prev->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
 		}
 
 		/* Adjust snapused. */
 		dsl_deadlist_space_range(&ds_next->ds_deadlist,
-		    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 		    -used, -comp, -uncomp, tx);
 
 		/* Move blocks to be freed to pool's free list. */
 		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
-		    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    tx);
 		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 		    DD_USED_HEAD, used, comp, uncomp, tx);
 
 		/* Merge our deadlist into next's and free it. */
 		dsl_deadlist_merge(&ds_next->ds_deadlist,
-		    ds->ds_phys->ds_deadlist_obj, tx);
+		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	}
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = 0;
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	/* Collapse range in clone heads */
 	dsl_dataset_remove_clones_key(ds,
-	    ds->ds_phys->ds_creation_txg, tx);
+	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 
 	if (dsl_dataset_is_snapshot(ds_next)) {
 		dsl_dataset_t *ds_nextnext;
 
 		/*
 		 * Update next's unique to include blocks which
 		 * were previously shared by only this snapshot
 		 * and it.  Those blocks will be born after the
 		 * prev snap and before this snap, and will have
 		 * died after the next snap and before the one
 		 * after that (ie. be on the snap after next's
 		 * deadlist).
 		 */
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+		    FTAG, &ds_nextnext));
 		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
-		    ds->ds_phys->ds_prev_snap_txg,
-		    ds->ds_phys->ds_creation_txg,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    dsl_dataset_phys(ds)->ds_creation_txg,
 		    &used, &comp, &uncomp);
-		ds_next->ds_phys->ds_unique_bytes += used;
+		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
 		dsl_dataset_rele(ds_nextnext, FTAG);
 		ASSERT3P(ds_next->ds_prev, ==, NULL);
 
 		/* Collapse range in this head. */
 		dsl_dataset_t *hds;
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
 		dsl_deadlist_remove_key(&hds->ds_deadlist,
-		    ds->ds_phys->ds_creation_txg, tx);
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 		dsl_dataset_rele(hds, FTAG);
 
 	} else {
 		ASSERT3P(ds_next->ds_prev, ==, ds);
 		dsl_dataset_rele(ds_next->ds_prev, ds_next);
 		ds_next->ds_prev = NULL;
 		if (ds_prev) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds_next, &ds_next->ds_prev));
 		}
 
 		dsl_dataset_recalc_head_uniq(ds_next);
 
 		/*
 		 * Reduce the amount of our unconsumed refreservation
 		 * being charged to our parent by the amount of
 		 * new unique data we have gained.
 		 */
 		if (old_unique < ds_next->ds_reserved) {
 			int64_t mrsdelta;
 			uint64_t new_unique =
-			    ds_next->ds_phys->ds_unique_bytes;
+			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 			ASSERT(old_unique <= new_unique);
 			mrsdelta = MIN(new_unique - old_unique,
 			    ds_next->ds_reserved - old_unique);
 			dsl_dir_diduse_space(ds->ds_dir,
 			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
 		}
 	}
 	dsl_dataset_rele(ds_next, FTAG);
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* remove from snapshot namespace */
 	dsl_dataset_t *ds_head;
-	ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
 	VERIFY0(dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 	{
 		uint64_t val;
 
 		err = dsl_dataset_snap_lookup(ds_head,
 		    ds->ds_snapname, &val);
 		ASSERT0(err);
 		ASSERT3U(val, ==, obj);
 	}
 #endif
 	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
 	dsl_dataset_rele(ds_head, FTAG);
 
 	if (ds_prev != NULL)
 		dsl_dataset_rele(ds_prev, FTAG);
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 		uint64_t count;
 		ASSERT0(zap_count(mos,
-		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+		    count == 0);
 		VERIFY0(dmu_object_free(mos,
-		    ds->ds_phys->ds_next_clones_obj, tx));
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
 	}
-	if (ds->ds_phys->ds_props_obj != 0)
-		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-	if (ds->ds_phys->ds_userrefs_obj != 0)
-		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+		    tx));
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    tx));
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 }
 
 static void
 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_snapshots_destroy_arg_t *dsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 
 	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
 	    pair != NULL;
 	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
 		dsl_dataset_t *ds;
 
 		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
 
 		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * The semantics of this function are described in the comment above
  * lzc_destroy_snaps().  To summarize:
  *
  * The snapshots must all be in the same pool.
  *
  * Snapshots that don't exist will be silently ignored (considered to be
  * "already deleted").
  *
  * On success, all snaps will be destroyed and this will return 0.
  * On failure, no snaps will be destroyed, the errlist will be filled in,
  * and this will return an errno.
  */
 int
 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
     nvlist_t *errlist)
 {
 	dmu_snapshots_destroy_arg_t dsda;
 	int error;
 	nvpair_t *pair;
 
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (0);
 
 	dsda.dsda_snaps = snaps;
 	dsda.dsda_successful_snaps = fnvlist_alloc();
 	dsda.dsda_defer = defer;
 	dsda.dsda_errlist = errlist;
 
 	error = dsl_sync_task(nvpair_name(pair),
 	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
 	    &dsda, 0, ZFS_SPACE_CHECK_NONE);
 	fnvlist_free(dsda.dsda_successful_snaps);
 
 	return (error);
 }
 
 int
 dsl_destroy_snapshot(const char *name, boolean_t defer)
 {
 	int error;
 	nvlist_t *nvl = fnvlist_alloc();
 	nvlist_t *errlist = fnvlist_alloc();
 
 	fnvlist_add_boolean(nvl, name);
 	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
 	fnvlist_free(errlist);
 	fnvlist_free(nvl);
 	return (error);
 }
 
 struct killarg {
 	dsl_dataset_t *ds;
 	dmu_tx_t *tx;
 };
 
 /* ARGSUSED */
 static int
 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level == ZB_ZIL_LEVEL) {
 		ASSERT(zilog != NULL);
 		/*
 		 * It's a block in the intent log.  It has no
 		 * accounting, so just free it.
 		 */
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(bp->blk_birth, >,
+		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
 
 	return (0);
 }
 
 static void
 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	struct killarg ka;
 
 	/*
 	 * Free everything that we point to (that's born after
 	 * the previous snapshot, if we are a clone)
 	 *
 	 * NB: this should be very quick, because we already
 	 * freed all the objects in open context.
 	 */
 	ka.ds = ds;
 	ka.tx = tx;
 	VERIFY0(traverse_dataset(ds,
-	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
 	    kill_blkptr, &ka));
-	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
 }
 
 typedef struct dsl_destroy_head_arg {
 	const char *ddha_name;
 } dsl_destroy_head_arg_t;
 
 int
 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
 {
 	int error;
 	uint64_t count;
 	objset_t *mos;
 
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 	if (dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	if (refcount_count(&ds->ds_longholds) != expected_holds)
 		return (SET_ERROR(EBUSY));
 
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Can't delete if there are children of this fs.
 	 */
 	error = zap_count(mos,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
 	if (error != 0)
 		return (error);
 	if (count != 0)
 		return (SET_ERROR(EEXIST));
 
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0) {
 		/* We need to remove the origin snapshot as well. */
 		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
 			return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 static int
 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	error = dsl_destroy_head_check_impl(ds, 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 static void
 dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dd_used_t t;
 
 	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
 
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 
-	ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	/*
 	 * Decrement the filesystem count for all parent filesystems.
 	 *
 	 * When we receive an incremental stream into a filesystem that already
 	 * exists, a temporary clone is created.  We never count this temporary
 	 * clone, whose name begins with a '%'.
 	 */
 	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
 		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 
 	/*
 	 * Remove our reservation. The impl() routine avoids setting the
 	 * actual property, which would require the (already destroyed) ds.
 	 */
 	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 
-	ASSERT0(dd->dd_phys->dd_used_bytes);
-	ASSERT0(dd->dd_phys->dd_reserved);
+	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
 	for (t = 0; t < DD_USED_NUM; t++)
-		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
+		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
 
-	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
-	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
-	VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
 	VERIFY0(zap_remove(mos,
-	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+	    dd->dd_myname, tx));
 
 	dsl_dir_rele(dd, FTAG);
 	dmu_object_free_zapified(mos, ddobj, tx);
 }
 
 void
 dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t obj, ddobj, prevobj = 0;
 	boolean_t rmorigin;
 
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
-	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/* We need to log before removing it from the namespace. */
 	spa_history_log_internal_ds(ds, "destroy", tx, "");
 
 	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0);
 
 	/* Remove our reservation. */
 	if (ds->ds_reserved != 0) {
 		dsl_dataset_set_refreservation_sync_impl(ds,
 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
 		    0, tx);
 		ASSERT0(ds->ds_reserved);
 	}
 
 	if (ds->ds_large_blocks)
 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	obj = ds->ds_object;
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		/* This is a clone */
 		ASSERT(ds->ds_prev != NULL);
-		ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
-		ASSERT0(ds->ds_phys->ds_next_snap_obj);
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+		    obj);
+		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
 
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-		if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    obj, tx);
 		}
 
-		ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
-		ds->ds_prev->ds_phys->ds_num_children--;
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
 	}
 
 	/*
 	 * Destroy the deadlist.  Unless it's a clone, the
 	 * deadlist should be empty.  (If it's a clone, it's
 	 * safe to ignore the deadlist contents.)
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = 0;
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	objset_t *os;
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		old_synchronous_dataset_destroy(ds, tx);
 	} else {
 		/*
 		 * Move the bptree into the pool's list of trees to
 		 * clean up and update space accounting information.
 		 */
 		uint64_t used, comp, uncomp;
 
 		zil_destroy_sync(dmu_objset_zil(os), tx);
 
 		if (!spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_ASYNC_DESTROY)) {
 			dsl_scan_t *scn = dp->dp_scan;
 			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
 			    tx);
 			dp->dp_bptree_obj = bptree_alloc(mos, tx);
 			VERIFY0(zap_add(mos,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 			    &dp->dp_bptree_obj, tx));
 			ASSERT(!scn->scn_async_destroying);
 			scn->scn_async_destroying = B_TRUE;
 		}
 
-		used = ds->ds_dir->dd_phys->dd_used_bytes;
-		comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
-		uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
 
 		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-		    ds->ds_phys->ds_unique_bytes == used);
+		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
 
 		bptree_add(mos, dp->dp_bptree_obj,
-		    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+		    &dsl_dataset_phys(ds)->ds_bp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    used, comp, uncomp, tx);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    -used, -comp, -uncomp, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 		    used, comp, uncomp, tx);
 	}
 
 	if (ds->ds_prev != NULL) {
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			VERIFY0(zap_remove_int(mos,
-			    ds->ds_prev->ds_dir->dd_phys->dd_clones,
+			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
 			    ds->ds_object, tx));
 		}
 		prevobj = ds->ds_prev->ds_object;
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* Erase the link in the dir */
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-	ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
 	ddobj = ds->ds_dir->dd_object;
-	ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-	VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+	VERIFY0(zap_destroy(mos,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
 
 	if (ds->ds_bookmarks != 0) {
-		VERIFY0(zap_destroy(mos,
-		    ds->ds_bookmarks, tx));
+		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 	}
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-	ASSERT0(ds->ds_phys->ds_next_clones_obj);
-	ASSERT0(ds->ds_phys->ds_props_obj);
-	ASSERT0(ds->ds_phys->ds_userrefs_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 
 	dsl_dir_destroy_sync(ddobj, tx);
 
 	if (rmorigin) {
 		dsl_dataset_t *prev;
 		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
 		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
 		dsl_dataset_rele(prev, FTAG);
 	}
 }
 
 static void
 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 	dsl_destroy_head_sync_impl(ds, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 static void
 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_destroy_head(const char *name)
 {
 	dsl_destroy_head_arg_t ddha;
 	int error;
 	spa_t *spa;
 	boolean_t isenabled;
 
 #ifdef _KERNEL
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
 	spa_close(spa, FTAG);
 
 	ddha.ddha_name = name;
 
 	if (!isenabled) {
 		objset_t *os;
 
 		error = dsl_sync_task(name, dsl_destroy_head_check,
 		    dsl_destroy_head_begin_sync, &ddha,
 		    0, ZFS_SPACE_CHECK_NONE);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Head deletion is processed in one txg on old pools;
 		 * remove the objects from open context so that the txg sync
 		 * is not too long.
 		 */
 		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
 		if (error == 0) {
 			uint64_t prev_snap_txg =
-			    dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+			    dsl_dataset_phys(dmu_objset_ds(os))->
+			    ds_prev_snap_txg;
 			for (uint64_t obj = 0; error == 0;
 			    error = dmu_object_next(os, &obj, FALSE,
 			    prev_snap_txg))
 				(void) dmu_free_long_object(os, obj);
 			/* sync out all frees */
 			txg_wait_synced(dmu_objset_pool(os), 0);
 			dmu_objset_disown(os, FTAG);
 		}
 	}
 
 	return (dsl_sync_task(name, dsl_destroy_head_check,
 	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Note, this function is used as the callback for dmu_objset_find().  We
  * always return 0 so that we will continue to find and process
  * inconsistent datasets, even if we encounter an error trying to
  * process one of them.
  */
 /* ARGSUSED */
 int
 dsl_destroy_inconsistent(const char *dsname, void *arg)
 {
 	objset_t *os;
 
 	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
 		boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
 		dmu_objset_rele(os, FTAG);
 		if (inconsistent)
 			(void) dsl_destroy_head(dsname);
 	}
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	(revision 275782)
@@ -1,1971 +1,1982 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
 #include <sys/spa.h>
 #include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/sunddi.h>
 #include <sys/zvol.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/zfeature.h>
 #include <sys/policy.h>
 #include <sys/zfs_znode.h>
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 
 /*
  * Filesystem and Snapshot Limits
  * ------------------------------
  *
  * These limits are used to restrict the number of filesystems and/or snapshots
  * that can be created at a given level in the tree or below. A typical
  * use-case is with a delegated dataset where the administrator wants to ensure
  * that a user within the zone is not creating too many additional filesystems
  * or snapshots, even though they're not exceeding their space quota.
  *
  * The filesystem and snapshot counts are stored as extensible properties. This
  * capability is controlled by a feature flag and must be enabled to be used.
  * Once enabled, the feature is not active until the first limit is set. At
  * that point, future operations to create/destroy filesystems or snapshots
  * will validate and update the counts.
  *
  * Because the count properties will not exist before the feature is active,
  * the counts are updated when a limit is first set on an uninitialized
  * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
  * all of the nested filesystems/snapshots. Thus, a new leaf node has a
  * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
  * snapshot count properties on a node indicate uninitialized counts on that
  * node.) When first setting a limit on an uninitialized node, the code starts
  * at the filesystem with the new limit and descends into all sub-filesystems
  * to add the count properties.
  *
  * In practice this is lightweight since a limit is typically set when the
  * filesystem is created and thus has no children. Once valid, changing the
  * limit value won't require a re-traversal since the counts are already valid.
  * When recursively fixing the counts, if a node with a limit is encountered
  * during the descent, the counts are known to be valid and there is no need to
  * descend into that filesystem's children. The counts on filesystems above the
  * one with the new limit will still be uninitialized, unless a limit is
  * eventually set on one of those filesystems. The counts are always recursively
  * updated when a limit is set on a dataset, unless there is already a limit.
  * When a new limit value is set on a filesystem with an existing limit, it is
  * possible for the new limit to be less than the current count at that level
  * since a user who can change the limit is also allowed to exceed the limit.
  *
  * Once the feature is active, then whenever a filesystem or snapshot is
  * created, the code recurses up the tree, validating the new count against the
  * limit at each initialized level. In practice, most levels will not have a
  * limit set. If there is a limit at any initialized level up the tree, the
  * check must pass or the creation will fail. Likewise, when a filesystem or
  * snapshot is destroyed, the counts are recursively adjusted all the way up
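- * the initizized nodes in the tree. Renaming a filesystem into different point
- * in the tree will first validate, then update the counts on each branch up to
- * the common ancestor. A receive will also validate the counts and then update
- * them.
+ * the initialized nodes in the tree. Renaming a filesystem to a different
+ * point in the tree will first validate, then update the counts on each branch
+ * up to the common ancestor. A receive will also validate the counts and then
+ * update them.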
  *
  * An exception to the above behavior is that the limit is not enforced if the
  * user has permission to modify the limit. This is primarily so that
  * recursive snapshots in the global zone always work. We want to prevent a
  * denial-of-service in which a lower level delegated dataset could max out its
  * limit and thus block recursive snapshots from being taken in the global zone.
  * Because of this, it is possible for the snapshot count to be over the limit
  * and snapshots taken in the global zone could cause a lower level dataset to
  * hit or exceed its limit. The administrator taking the global zone recursive
  * snapshot should be aware of this side-effect and behave accordingly.
  * For consistency, the filesystem limit is also not enforced if the user can
  * modify the limit.
  *
  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
  * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
  * dsl_dir_init_fs_ss_count().
  *
  * There is a special case when we receive a filesystem that already exists. In
  * this case a temporary clone name of %X is created (see dmu_recv_begin). We
  * never update the filesystem counts for temporary clones.
  *
  * Likewise, we do not update the snapshot counts for temporary snapshots,
  * such as those created by zfs diff.
  */
 
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 
 /* ARGSUSED */
 static void
 dsl_dir_evict(dmu_buf_t *db, void *arg)
 {
 	dsl_dir_t *dd = arg;
 	dsl_pool_t *dp = dd->dd_pool;
 	int t;
 
 	for (t = 0; t < TXG_SIZE; t++) {
 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 		ASSERT(dd->dd_tempreserved[t] == 0);
 		ASSERT(dd->dd_space_towrite[t] == 0);
 	}
 
 	if (dd->dd_parent)
 		dsl_dir_rele(dd->dd_parent, dd);
 
 	spa_close(dd->dd_pool->dp_spa, dd);
 
 	/*
 	 * The props callback list should have been cleaned up by
 	 * objset_evict().
 	 */
 	list_destroy(&dd->dd_prop_cbs);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 }
 
 int
 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **ddp)
 {
 	dmu_buf_t *dbuf;
 	dsl_dir_t *dd;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 	dd = dmu_buf_get_user(dbuf);
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbuf, &doi);
 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
 		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
 	}
 #endif
 	if (dd == NULL) {
 		dsl_dir_t *winner;
 
 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
 		dd->dd_object = ddobj;
 		dd->dd_dbuf = dbuf;
 		dd->dd_pool = dp;
-		dd->dd_phys = dbuf->db_data;
 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 
 		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
 		    offsetof(dsl_prop_cb_record_t, cbr_node));
 
 		dsl_dir_snap_cmtime_update(dd);
 
-		if (dd->dd_phys->dd_parent_obj) {
-			err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
-			    NULL, dd, &dd->dd_parent);
+		if (dsl_dir_phys(dd)->dd_parent_obj) {
+			err = dsl_dir_hold_obj(dp,
+			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+			    &dd->dd_parent);
 			if (err != 0)
 				goto errout;
 			if (tail) {
 #ifdef ZFS_DEBUG
 				uint64_t foundobj;
 
 				err = zap_lookup(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
-				    tail, sizeof (foundobj), 1, &foundobj);
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj, tail,
+				    sizeof (foundobj), 1, &foundobj);
 				ASSERT(err || foundobj == ddobj);
 #endif
 				(void) strcpy(dd->dd_myname, tail);
 			} else {
 				err = zap_value_search(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj,
 				    ddobj, 0, dd->dd_myname);
 			}
 			if (err != 0)
 				goto errout;
 		} else {
 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 		}
 
 		if (dsl_dir_is_clone(dd)) {
 			dmu_buf_t *origin_bonus;
 			dsl_dataset_phys_t *origin_phys;
 
 			/*
 			 * We can't open the origin dataset, because
 			 * that would require opening this dsl_dir.
 			 * Just look at its phys directly instead.
 			 */
 			err = dmu_bonus_hold(dp->dp_meta_objset,
-			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+			    &origin_bonus);
 			if (err != 0)
 				goto errout;
 			origin_phys = origin_bonus->db_data;
 			dd->dd_origin_txg =
 			    origin_phys->ds_creation_txg;
 			dmu_buf_rele(origin_bonus, FTAG);
 		}
 
-		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
-		    dsl_dir_evict);
+		winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict);
 		if (winner) {
 			if (dd->dd_parent)
 				dsl_dir_rele(dd->dd_parent, dd);
 			mutex_destroy(&dd->dd_lock);
 			kmem_free(dd, sizeof (dsl_dir_t));
 			dd = winner;
 		} else {
 			spa_open_ref(dp->dp_spa, dd);
 		}
 	}
 
 	/*
 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
 	 * holds on the spa.  We need the open-to-close holds because
 	 * otherwise the spa_refcnt wouldn't change when we open a
 	 * dir which the spa also has open, so we could incorrectly
 	 * think it was OK to unload/export/destroy the pool.  We need
 	 * the instantiate-to-evict hold because the dsl_dir_t has a
 	 * pointer to the dd_pool, which has a pointer to the spa_t.
 	 */
 	spa_open_ref(dp->dp_spa, tag);
 	ASSERT3P(dd->dd_pool, ==, dp);
 	ASSERT3U(dd->dd_object, ==, ddobj);
 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
 	*ddp = dd;
 	return (0);
 
 errout:
 	if (dd->dd_parent)
 		dsl_dir_rele(dd->dd_parent, dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 	dmu_buf_rele(dbuf, tag);
 	return (err);
 }
 
 void
 dsl_dir_rele(dsl_dir_t *dd, void *tag)
 {
 	dprintf_dd(dd, "%s\n", "");
 	spa_close(dd->dd_pool->dp_spa, tag);
 	dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 void
 dsl_dir_name(dsl_dir_t *dd, char *buf)
 {
 	if (dd->dd_parent) {
 		dsl_dir_name(dd->dd_parent, buf);
 		(void) strcat(buf, "/");
 	} else {
 		buf[0] = '\0';
 	}
 	if (!MUTEX_HELD(&dd->dd_lock)) {
 		/*
 		 * recursive mutex so that we can use
 		 * dprintf_dd() with dd_lock held
 		 */
 		mutex_enter(&dd->dd_lock);
 		(void) strcat(buf, dd->dd_myname);
 		mutex_exit(&dd->dd_lock);
 	} else {
 		(void) strcat(buf, dd->dd_myname);
 	}
 }
 
 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 int
 dsl_dir_namelen(dsl_dir_t *dd)
 {
 	int result = 0;
 
 	if (dd->dd_parent) {
 		/* parent's name + 1 for the "/" */
 		result = dsl_dir_namelen(dd->dd_parent) + 1;
 	}
 
 	if (!MUTEX_HELD(&dd->dd_lock)) {
 		/* see dsl_dir_name */
 		mutex_enter(&dd->dd_lock);
 		result += strlen(dd->dd_myname);
 		mutex_exit(&dd->dd_lock);
 	} else {
 		result += strlen(dd->dd_myname);
 	}
 
 	return (result);
 }
 
 static int
 getcomponent(const char *path, char *component, const char **nextp)
 {
 	char *p;
 
 	if ((path == NULL) || (path[0] == '\0'))
 		return (SET_ERROR(ENOENT));
 	/* This would be a good place to reserve some namespace... */
 	p = strpbrk(path, "/@");
 	if (p && (p[1] == '/' || p[1] == '@')) {
 		/* two separators in a row */
 		return (SET_ERROR(EINVAL));
 	}
 	if (p == NULL || p == path) {
 		/*
 		 * if the first thing is an @ or /, it had better be an
 		 * @ and it had better not have any more ats or slashes,
 		 * and it had better have something after the @.
 		 */
 		if (p != NULL &&
 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
 			return (SET_ERROR(EINVAL));
 		if (strlen(path) >= MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strcpy(component, path);
 		p = NULL;
 	} else if (p[0] == '/') {
 		if (p - path >= MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
 		component[p - path] = '\0';
 		p++;
 	} else if (p[0] == '@') {
 		/*
 		 * if the next separator is an @, there better not be
 		 * any more slashes.
 		 */
 		if (strchr(path, '/'))
 			return (SET_ERROR(EINVAL));
 		if (p - path >= MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
 		component[p - path] = '\0';
 	} else {
 		panic("invalid p=%p", (void *)p);
 	}
 	*nextp = p;
 	return (0);
 }
 
 /*
- * Return the dsl_dir_t, and possibly the last component which couldn't
- * be found in *tail.  The name must be in the specified dsl_pool_t.  This
- * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
- * path is bogus, or if tail==NULL and we couldn't parse the whole name.
- * (*tail)[0] == '@' means that the last component is a snapshot.
+ * Return the dsl_dir_t in *ddp, and possibly the last component which couldn't
+ * be found in *tailp.  The name must be in the specified dsl_pool_t.  This
+ * thread must hold the dp_config_rwlock for the pool.  Returns an errno if the
+ * path is bogus, or if tailp==NULL and we couldn't parse the whole name.
+ * (*tailp)[0] == '@' means that the last component is a snapshot.
  */
 int
 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
 	char buf[MAXNAMELEN];
 	const char *spaname, *next, *nextnext = NULL;
 	int err;
 	dsl_dir_t *dd;
 	uint64_t ddobj;
 
 	err = getcomponent(name, buf, &next);
 	if (err != 0)
 		return (err);
 
 	/* Make sure the name is in the specified pool. */
 	spaname = spa_name(dp->dp_spa);
 	if (strcmp(buf, spaname) != 0)
 		return (SET_ERROR(EINVAL));
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 	if (err != 0) {
 		return (err);
 	}
 
 	while (next != NULL) {
 		dsl_dir_t *child_ds;
 		err = getcomponent(next, buf, &nextnext);
 		if (err != 0)
 			break;
 		ASSERT(next[0] != '\0');
 		if (next[0] == '@')
 			break;
 		dprintf("looking up %s in obj%lld\n",
-		    buf, dd->dd_phys->dd_child_dir_zapobj);
+		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 
 		err = zap_lookup(dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj,
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		    buf, sizeof (ddobj), 1, &ddobj);
 		if (err != 0) {
 			if (err == ENOENT)
 				err = 0;
 			break;
 		}
 
 		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
 		if (err != 0)
 			break;
 		dsl_dir_rele(dd, tag);
 		dd = child_ds;
 		next = nextnext;
 	}
 
 	if (err != 0) {
 		dsl_dir_rele(dd, tag);
 		return (err);
 	}
 
 	/*
 	 * It's an error if there's more than one component left, or
 	 * tailp==NULL and there's any component left.
 	 */
 	if (next != NULL &&
 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
-		/* bad path name */
+		/* bad path name: our hold is gone, so *ddp must not get dd */
 		dsl_dir_rele(dd, tag);
 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 		err = SET_ERROR(ENOENT);
 	}
 	if (tailp != NULL)
 		*tailp = next;
-	*ddp = dd;
+	*ddp = (err == 0) ? dd : NULL;
 	return (err);
 }
 
 /*
  * If the counts are already initialized for this filesystem and its
  * descendants then do nothing, otherwise initialize the counts.
  *
  * The counts on this filesystem, and those below, may be uninitialized due to
  * either the use of a pre-existing pool which did not support the
  * filesystem/snapshot limit feature, or one in which the feature had not yet
  * been enabled.
  *
  * Recursively descend the filesystem tree and update the filesystem/snapshot
  * counts on each filesystem below, then update the cumulative count on the
  * current filesystem. If the filesystem already has a count set on it,
  * then we know that its counts, and the counts on the filesystems below it,
  * are already correct, so we don't have to update this filesystem.
  */
 static void
 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	uint64_t my_fs_cnt = 0;
 	uint64_t my_ss_cnt = 0;
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *os = dp->dp_meta_objset;
 	zap_cursor_t *zc;
 	zap_attribute_t *za;
 	dsl_dataset_t *ds;
 
 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
 	ASSERT(dsl_pool_config_held(dp));
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dsl_dir_zapify(dd, tx);
 
 	/*
 	 * If the filesystem count has already been initialized then we
 	 * don't need to recurse down any further.
 	 */
 	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
 		return;
 
 	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/* Iterate my child dirs */
-	for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
+	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
 		dsl_dir_t *chld_dd;
 		uint64_t count;
 
 		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
 		    &chld_dd));
 
 		/*
 		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
 		 * temporary datasets.
 		 */
 		if (chld_dd->dd_myname[0] == '$' ||
 		    chld_dd->dd_myname[0] == '%') {
 			dsl_dir_rele(chld_dd, FTAG);
 			continue;
 		}
 
 		my_fs_cnt++;	/* count this child */
 
 		dsl_dir_init_fs_ss_count(chld_dd, tx);
 
 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
 		my_fs_cnt += count;
 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
 		my_ss_cnt += count;
 
 		dsl_dir_rele(chld_dd, FTAG);
 	}
 	zap_cursor_fini(zc);
 	/* Count my snapshots (we counted children's snapshots above) */
 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
-	    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
+	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
 
-	for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
+	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 		/* Don't count temporary snapshots */
 		if (za->za_name[0] != '%')
 			my_ss_cnt++;
 	}
 	zap_cursor_fini(zc);
 
 	dsl_dataset_rele(ds, FTAG);
 
 	kmem_free(zc, sizeof (zap_cursor_t));
 	kmem_free(za, sizeof (zap_attribute_t));
 
 	/* we're in a sync task, update counts */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
 	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
 	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
 }
 
 static int
 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
 {
 	char *ddname = (char *)arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	dd = ds->ds_dir;
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
 	    dsl_dir_is_zapified(dd) &&
 	    zap_contains(dp->dp_meta_objset, dd->dd_object,
 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EALREADY));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
 {
 	char *ddname = (char *)arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	spa_t *spa;
 
 	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
 
 	spa = dsl_dataset_get_spa(ds);
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		/*
 		 * Since the feature was not active and we're now setting a
 		 * limit, increment the feature-active counter so that the
 		 * feature becomes active for the first time.
 		 *
 		 * We are already in a sync task so we can update the MOS.
 		 */
 		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
 	}
 
 	/*
 	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
 	 * we need to ensure the counts are correct. Descend down the tree from
 	 * this point and update all of the counts to be accurate.
 	 */
 	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Make sure the feature is enabled and activate it if necessary.
  * Since we're setting a limit, ensure the on-disk counts are valid.
  * This is only called by the ioctl path when setting a limit value.
  *
  * We do not need to validate the new limit, since users who can change the
  * limit are also allowed to exceed the limit.
  */
 int
 dsl_dir_activate_fs_ss_limit(const char *ddname)
 {
 	int error;
 
 	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
 	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
 	    ZFS_SPACE_CHECK_RESERVED);
 
 	if (error == EALREADY)
 		error = 0;
 
 	return (error);
 }
 
 /*
  * Used to determine if the filesystem_limit or snapshot_limit should be
  * enforced. We allow the limit to be exceeded if the user has permission to
  * write the property value. We pass in the creds that we got in the open
  * context since we will always be the GZ root in syncing context. We also have
  * to handle the case where we are allowed to change the limit on the current
  * dataset, but there may be another limit in the tree above.
  *
  * We can never modify these two properties within a non-global zone. In
  * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
  * can't use that function since we are already holding the dp_config_rwlock.
  * In addition, we already have the dd and dealing with snapshots is simplified
  * in this code.
  */
 
 typedef enum {
 	ENFORCE_ALWAYS,
 	ENFORCE_NEVER,
 	ENFORCE_ABOVE
 } enforce_res_t;
 
 static enforce_res_t
 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
 {
 	enforce_res_t enforce = ENFORCE_ALWAYS;
 	uint64_t obj;
 	dsl_dataset_t *ds;
 	uint64_t zoned;
 
 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
 
 #ifdef _KERNEL
 #ifdef __FreeBSD__
 	if (jailed(cr))
 #else
 	if (crgetzoneid(cr) != GLOBAL_ZONEID)
 #endif
 		return (ENFORCE_ALWAYS);
 
 	if (secpolicy_zfs(cr) == 0)
 		return (ENFORCE_NEVER);
 #endif
 
-	if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0)
+	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
 		return (ENFORCE_ALWAYS);
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 
 	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
 		return (ENFORCE_ALWAYS);
 
 	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
 		/* Only root can access zoned fs's from the GZ */
 		enforce = ENFORCE_ALWAYS;
 	} else {
 		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
 			enforce = ENFORCE_ABOVE;
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (enforce);
 }
 
 /*
  * Check if adding additional child filesystem(s) would exceed any filesystem
  * limits or adding additional snapshot(s) would exceed any snapshot limits.
  * The prop argument indicates which limit to check.
  *
  * Note that all filesystem limits up to the root (or the highest
  * initialized) filesystem or the given ancestor must be satisfied.
  */
 int
 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
     dsl_dir_t *ancestor, cred_t *cr)
 {
 	objset_t *os = dd->dd_pool->dp_meta_objset;
 	uint64_t limit, count;
 	char *count_prop;
 	enforce_res_t enforce;
 	int err = 0;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
 
 	/*
 	 * If we're allowed to change the limit, don't enforce the limit
 	 * e.g. this can happen if a snapshot is taken by an administrative
 	 * user in the global zone (i.e. a recursive snapshot by root).
 	 * However, we must handle the case of delegated permissions where we
 	 * are allowed to change the limit on the current dataset, but there
 	 * is another limit in the tree above.
 	 */
 	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
 	if (enforce == ENFORCE_NEVER)
 		return (0);
 
 	/*
 	 * e.g. if renaming a dataset with no snapshots, count adjustment
 	 * is 0.
 	 */
 	if (delta == 0)
 		return (0);
 
 	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
 		/*
 		 * We don't enforce the limit for temporary snapshots. This is
 		 * indicated by a NULL cred_t argument.
 		 */
 		if (cr == NULL)
 			return (0);
 
 		count_prop = DD_FIELD_SNAPSHOT_COUNT;
 	} else {
 		count_prop = DD_FIELD_FILESYSTEM_COUNT;
 	}
 
 	/*
 	 * If an ancestor has been provided, stop checking the limit once we
 	 * hit that dir. We need this during rename so that we don't overcount
 	 * the check once we recurse up to the common ancestor.
 	 */
 	if (ancestor == dd)
 		return (0);
 
 	/*
 	 * If we hit an uninitialized node while recursing up the tree, we can
-	 * stop since we know there is no limit here (or above). The counts are
-	 * not valid on this node and we know we won't touch this node's counts.
+	 * stop since there is no limit here (or above) and we won't touch this
+	 * node's counts.  Any other lookup failure is passed to the caller.
 	 */
-	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
-	    count_prop, sizeof (count), 1, &count) == ENOENT)
-		return (0);
+	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+	    count_prop, sizeof (count), 1, &count)) != 0)
+		return (err == ENOENT ? 0 : err);
 
 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
 	    B_FALSE);
 	if (err != 0)
 		return (err);
 
 	/* Is there a limit which we've hit? */
 	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
 		return (SET_ERROR(EDQUOT));
 
 	if (dd->dd_parent != NULL)
 		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
 		    ancestor, cr);
 
 	return (err);
 }
 
 /*
  * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
  * parents. When a new filesystem/snapshot is created, increment the count on
  * all parents, and when a filesystem/snapshot is destroyed, decrement the
  * count.
  */
 void
 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
     dmu_tx_t *tx)
 {
 	int err;
 	objset_t *os = dd->dd_pool->dp_meta_objset;
 	uint64_t count;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
 	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
 
 	/*
 	 * When we receive an incremental stream into a filesystem that already
 	 * exists, a temporary clone is created.  We don't count this temporary
 	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
 	 * $MOS & $ORIGIN) objsets.
 	 */
 	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
 	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
 		return;
 
 	/*
 	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
 	 */
 	if (delta == 0)
 		return;
 
 	/*
 	 * If we hit an uninitialized node while recursing up the tree, we can
 	 * stop since we know the counts are not valid on this node and we
 	 * know we shouldn't touch this node's counts. An uninitialized count
 	 * on the node indicates that either the feature has not yet been
 	 * activated or there are no limits on this part of the tree.
 	 */
 	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
 	    prop, sizeof (count), 1, &count)) == ENOENT)
 		return;
 	VERIFY0(err);
 
 	count += delta;
 	/* Use a signed verify to make sure we're not neg. */
 	VERIFY3S(count, >=, 0);
 
 	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
 	    tx));
 
 	/* Roll up this additional count into our ancestors */
 	if (dd->dd_parent != NULL)
 		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
 }
 
 uint64_t
 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
     dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ddobj;
 	dsl_dir_phys_t *ddphys;
 	dmu_buf_t *dbuf;
 
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 	if (pds) {
-		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+		VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
 		    name, sizeof (uint64_t), 1, &ddobj, tx));
 	} else {
 		/* it's the root dir */
 		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 	}
 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	ddphys = dbuf->db_data;
 
 	ddphys->dd_creation_time = gethrestime_sec();
 	if (pds) {
 		ddphys->dd_parent_obj = pds->dd_object;
 
 		/* update the filesystem counts */
 		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
 	}
 	ddphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	ddphys->dd_child_dir_zapobj = zap_create(mos,
 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 	dmu_buf_rele(dbuf, FTAG);
 
 	return (ddobj);
 }
 
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
-	return (dd->dd_phys->dd_origin_obj &&
+	return (dsl_dir_phys(dd)->dd_origin_obj &&
 	    (dd->dd_pool->dp_origin_snap == NULL ||
-	    dd->dd_phys->dd_origin_obj !=
+	    dsl_dir_phys(dd)->dd_origin_obj !=
 	    dd->dd_pool->dp_origin_snap->ds_object));
 }
 
 void
 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 {
 	mutex_enter(&dd->dd_lock);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-	    dd->dd_phys->dd_used_bytes);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+	    dsl_dir_phys(dd)->dd_used_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+	    dsl_dir_phys(dd)->dd_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
-	    dd->dd_phys->dd_reserved);
+	    dsl_dir_phys(dd)->dd_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
-	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
-	    dd->dd_phys->dd_compressed_bytes));
+	    dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+	    dsl_dir_phys(dd)->dd_compressed_bytes));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
-	    dd->dd_phys->dd_uncompressed_bytes);
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes);
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 	}
 	mutex_exit(&dd->dd_lock);
 
 	if (dsl_dir_is_zapified(dd)) {
 		uint64_t count;
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 
 		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
 		    sizeof (count), 1, &count) == 0) {
 			dsl_prop_nvlist_add_uint64(nv,
 			    ZFS_PROP_FILESYSTEM_COUNT, count);
 		}
 		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
 		    sizeof (count), 1, &count) == 0) {
 			dsl_prop_nvlist_add_uint64(nv,
 			    ZFS_PROP_SNAPSHOT_COUNT, count);
 		}
 	}
 
 	if (dsl_dir_is_clone(dd)) {
 		dsl_dataset_t *ds;
 		char buf[MAXNAMELEN];
 
 		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &ds));
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
 	}
 }
 
 void
 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 
-	ASSERT(dd->dd_phys);
+	ASSERT(dsl_dir_phys(dd));
 
 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(dd->dd_dbuf, dd);
 	}
 }
 
 static int64_t
 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 {
-	uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
-	uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	uint64_t new_accounted =
+	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
 	return (new_accounted - old_accounted);
 }
 
 void
 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	mutex_enter(&dd->dd_lock);
 	ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
 	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
 	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
 	mutex_exit(&dd->dd_lock);
 
 	/* release the hold from dsl_dir_dirty */
 	dmu_buf_rele(dd->dd_dbuf, dd);
 }
 
 static uint64_t
 dsl_dir_space_towrite(dsl_dir_t *dd)
 {
 	uint64_t space = 0;
 	int i;
 
 	ASSERT(MUTEX_HELD(&dd->dd_lock));
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		space += dd->dd_space_towrite[i&TXG_MASK];
 		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
 	}
 	return (space);
 }
 
 /*
  * How much space would dd have available if ancestor had delta applied
  * to it?  If ondiskonly is set, we're only interested in what's
  * on-disk, not estimated pending changes.
  */
 uint64_t
 dsl_dir_space_available(dsl_dir_t *dd,
     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
 {
 	uint64_t parentspace, myspace, quota, used;
 
 	/*
 	 * If there are no restrictions otherwise, assume we have
 	 * unlimited space available.
 	 */
 	quota = UINT64_MAX;
 	parentspace = UINT64_MAX;
 
 	if (dd->dd_parent != NULL) {
 		parentspace = dsl_dir_space_available(dd->dd_parent,
 		    ancestor, delta, ondiskonly);
 	}
 
 	mutex_enter(&dd->dd_lock);
-	if (dd->dd_phys->dd_quota != 0)
-		quota = dd->dd_phys->dd_quota;
-	used = dd->dd_phys->dd_used_bytes;
+	if (dsl_dir_phys(dd)->dd_quota != 0)
+		quota = dsl_dir_phys(dd)->dd_quota;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	if (!ondiskonly)
 		used += dsl_dir_space_towrite(dd);
 
 	if (dd->dd_parent == NULL) {
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
 		quota = MIN(quota, poolsize);
 	}
 
-	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
 		/*
 		 * We have some space reserved, in addition to what our
 		 * parent gave us.
 		 */
-		parentspace += dd->dd_phys->dd_reserved - used;
+		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
 	}
 
 	if (dd == ancestor) {
 		ASSERT(delta <= 0);
 		ASSERT(used >= -delta);
 		used += delta;
 		if (parentspace != UINT64_MAX)
 			parentspace -= delta;
 	}
 
 	if (used > quota) {
 		/* over quota */
 		myspace = 0;
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
 		 * the space left in our quota
 		 */
 		myspace = MIN(parentspace, quota - used);
 	}
 
 	mutex_exit(&dd->dd_lock);
 
 	return (myspace);
 }
 
 struct tempreserve {
 	list_node_t tr_node;
 	dsl_dir_t *tr_ds;
 	uint64_t tr_size;
 };
 
 static int
 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
     boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
     dmu_tx_t *tx, boolean_t first)
 {
 	uint64_t txg = tx->tx_txg;
 	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 	uint64_t deferred = 0;
 	struct tempreserve *tr;
 	int retval = EDQUOT;
 	int txgidx = txg & TXG_MASK;
 	int i;
 	uint64_t ref_rsrv = 0;
 
 	ASSERT3U(txg, !=, 0);
 	ASSERT3S(asize, >, 0);
 
 	mutex_enter(&dd->dd_lock);
 
 	/*
 	 * Check against the dsl_dir's quota.  We don't add in the delta
 	 * when checking for over-quota because they get one free hit.
 	 */
 	est_inflight = dsl_dir_space_towrite(dd);
 	for (i = 0; i < TXG_SIZE; i++)
 		est_inflight += dd->dd_tempreserved[i];
-	used_on_disk = dd->dd_phys->dd_used_bytes;
+	used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
 
 	/*
 	 * On the first iteration, fetch the dataset's used-on-disk and
 	 * refreservation values. Also, if checkrefquota is set, test if
 	 * allocating this space would exceed the dataset's refquota.
 	 */
 	if (first && tx->tx_objset) {
 		int error;
 		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 
 		error = dsl_dataset_check_quota(ds, checkrefquota,
 		    asize, est_inflight, &used_on_disk, &ref_rsrv);
 		if (error) {
 			mutex_exit(&dd->dd_lock);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this transaction will result in a net free of space,
 	 * we want to let it through.
 	 */
-	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
 		quota = UINT64_MAX;
 	else
-		quota = dd->dd_phys->dd_quota;
+		quota = dsl_dir_phys(dd)->dd_quota;
 
 	/*
 	 * Adjust the quota against the actual pool size at the root
 	 * minus any outstanding deferred frees.
 	 * To ensure that it's possible to remove files from a full
 	 * pool without inducing transient overcommits, we throttle
 	 * netfree transactions against a quota that is slightly larger,
 	 * but still within the pool's allocation slop.  In cases where
 	 * we're very close to full, this will allow a steady trickle of
 	 * removes to get through.
 	 */
 	if (dd->dd_parent == NULL) {
 		spa_t *spa = dd->dd_pool->dp_spa;
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
 		if (poolsize - deferred < quota) {
 			quota = poolsize - deferred;
 			retval = ENOSPC;
 		}
 	}
 
 	/*
 	 * If they are requesting more space, and our current estimate
 	 * is over quota, they get to try again unless the actual
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
 	if (used_on_disk + est_inflight >= quota) {
 		if (est_inflight > 0 || used_on_disk < quota ||
 		    (retval == ENOSPC && used_on_disk < quota + deferred))
 			retval = ERESTART;
 		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 		    "quota=%lluK tr=%lluK err=%d\n",
 		    used_on_disk>>10, est_inflight>>10,
 		    quota>>10, asize>>10, retval);
 		mutex_exit(&dd->dd_lock);
 		return (SET_ERROR(retval));
 	}
 
 	/* We need to up our estimated delta before dropping dd_lock */
 	dd->dd_tempreserved[txgidx] += asize;
 
 	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
 	    asize - ref_rsrv);
 	mutex_exit(&dd->dd_lock);
 
 	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 	tr->tr_ds = dd;
 	tr->tr_size = asize;
 	list_insert_tail(tr_list, tr);
 
 	/* see if it's OK with our parent */
 	if (dd->dd_parent && parent_rsrv) {
-		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+		boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
 		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
 	} else {
 		return (0);
 	}
 }
 
 /*
  * Reserve space in this dsl_dir, to be used in this tx's txg.
  * After the space has been dirtied (and dsl_dir_willuse_space()
  * has been called), the reservation should be canceled, using
  * dsl_dir_tempreserve_clear().
  */
 int
 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
     uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
 {
 	int err;
 	list_t *tr_list;
 
 	if (asize == 0) {
 		*tr_cookiep = NULL;
 		return (0);
 	}
 
 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 	list_create(tr_list, sizeof (struct tempreserve),
 	    offsetof(struct tempreserve, tr_node));
 	ASSERT3S(asize, >, 0);
 	ASSERT3S(fsize, >=, 0);
 
 	err = arc_tempreserve_space(lsize, tx->tx_txg);
 	if (err == 0) {
 		struct tempreserve *tr;
 
 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 		tr->tr_size = lsize;
 		list_insert_tail(tr_list, tr);
 	} else {
 		if (err == EAGAIN) {
 			/*
 			 * If arc_memory_throttle() detected that pageout
 			 * is running and we are low on memory, we delay new
 			 * non-pageout transactions to give pageout an
 			 * advantage.
 			 *
 			 * It is unfortunate to be delaying while the caller's
 			 * locks are held.
 			 */
 			txg_delay(dd->dd_pool, tx->tx_txg,
 			    MSEC2NSEC(10), MSEC2NSEC(10));
 			err = SET_ERROR(ERESTART);
 		}
 	}
 
 	if (err == 0) {
 		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 		    FALSE, asize > usize, tr_list, tx, TRUE);
 	}
 
 	if (err != 0)
 		dsl_dir_tempreserve_clear(tr_list, tx);
 	else
 		*tr_cookiep = tr_list;
 
 	return (err);
 }
 
 /*
  * Clear a temporary reservation that we previously made with
  * dsl_dir_tempreserve_space().
  */
 void
 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 {
 	int txgidx = tx->tx_txg & TXG_MASK;
 	list_t *tr_list = tr_cookie;
 	struct tempreserve *tr;
 
 	ASSERT3U(tx->tx_txg, !=, 0);
 
 	if (tr_cookie == NULL)
 		return;
 
 	while ((tr = list_head(tr_list)) != NULL) {
 		if (tr->tr_ds) {
 			mutex_enter(&tr->tr_ds->dd_lock);
 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 			    tr->tr_size);
 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 			mutex_exit(&tr->tr_ds->dd_lock);
 		} else {
 			arc_tempreserve_clear(tr->tr_size);
 		}
 		list_remove(tr_list, tr);
 		kmem_free(tr, sizeof (struct tempreserve));
 	}
 
 	kmem_free(tr_list, sizeof (list_t));
 }
 
 /*
  * This should be called from open context when we think we're going to write
  * or free space, for example when dirtying data. Be conservative; it's okay
  * to write less space or free more, but we don't want to write more or free
  * less than the amount specified.
  */
 void
 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 {
 	int64_t parent_space;
 	uint64_t est_used;
 
 	mutex_enter(&dd->dd_lock);
 	if (space > 0)
 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
-	est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
+	est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
 	parent_space = parent_delta(dd, est_used, space);
 	mutex_exit(&dd->dd_lock);
 
 	/* Make sure that we clean up dd_space_to* */
 	dsl_dir_dirty(dd, tx);
 
 	/* XXX this is potentially expensive and unnecessary... */
 	if (parent_space && dd->dd_parent)
 		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 }
 
 /* call from syncing context when we actually write/free space for this dd */
 void
 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
 
 	/*
 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
 	 * dd_lock held, so that it can atomically update
 	 * ds->ds_reserved and the dsl_dir accounting, so that
 	 * dsl_dataset_check_quota() can see dataset and dir accounting
 	 * consistently.
 	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(type < DD_USED_NUM);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
-	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
-	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
+	accounted_delta =
+	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
 	ASSERT(compressed >= 0 ||
-	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
-	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
-	dd->dd_phys->dd_used_bytes += used;
-	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
-	dd->dd_phys->dd_compressed_bytes += compressed;
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+	dsl_dir_phys(dd)->dd_used_bytes += used;
+	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
 
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		ASSERT(used > 0 ||
-		    dd->dd_phys->dd_used_breakdown[type] >= -used);
-		dd->dd_phys->dd_used_breakdown[type] += used;
+		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
 #ifdef DEBUG
 		dd_used_t t;
 		uint64_t u = 0;
 		for (t = 0; t < DD_USED_NUM; t++)
-			u += dd->dd_phys->dd_used_breakdown[t];
-		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+			u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+		ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
 #endif
 	}
 	if (needlock)
 		mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent != NULL) {
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
 		    accounted_delta, compressed, uncompressed, tx);
 		dsl_dir_transfer_space(dd->dd_parent,
 		    used - accounted_delta,
 		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 	}
 }
 
 void
 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
-	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+	if (delta == 0 ||
+	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
 		return;
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
-	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
-	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
-	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
-	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
-	dd->dd_phys->dd_used_breakdown[newtype] += delta;
+	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
 	mutex_exit(&dd->dd_lock);
 }
 
 typedef struct dsl_dir_set_qr_arg {
 	const char *ddsqra_name;
 	zprop_source_t ddsqra_source;
 	uint64_t ddsqra_value;
 } dsl_dir_set_qr_arg_t;
 
 static int
 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t towrite, newval;
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	error = dsl_prop_predict(ds->ds_dir, "quota",
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	if (newval == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	mutex_enter(&ds->ds_dir->dd_lock);
 	/*
 	 * If we are doing the preliminary check in open context, and
 	 * there are pending changes, then don't fail it, since the
 	 * pending changes could under-estimate the amount of space to be
 	 * freed up.
 	 */
 	towrite = dsl_dir_space_towrite(ds->ds_dir);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-	    (newval < ds->ds_dir->dd_phys->dd_reserved ||
-	    newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
+	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
 		error = SET_ERROR(ENOSPC);
 	}
 	mutex_exit(&ds->ds_dir->dd_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 static void
 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 		    &ddsqra->ddsqra_value, tx);
 
 		VERIFY0(dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
 	} else {
 		newval = ddsqra->ddsqra_value;
 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
 		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
 	}
 
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 	mutex_enter(&ds->ds_dir->dd_lock);
-	ds->ds_dir->dd_phys->dd_quota = newval;
+	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
 	mutex_exit(&ds->ds_dir->dd_lock);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
 	dsl_dir_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = ddname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = quota;
 
 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
 	    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 int
 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	uint64_t newval, used, avail;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 	dd = ds->ds_dir;
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent) {
 		avail = dsl_dir_space_available(dd->dd_parent,
 		    NULL, 0, FALSE);
 	} else {
 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
 	}
 
-	if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
+	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
 		uint64_t delta = MAX(used, newval) -
-		    MAX(used, dd->dd_phys->dd_reserved);
+		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
 
 		if (delta > avail ||
-		    (dd->dd_phys->dd_quota > 0 &&
-		    newval > dd->dd_phys->dd_quota))
+		    (dsl_dir_phys(dd)->dd_quota > 0 &&
+		    newval > dsl_dir_phys(dd)->dd_quota))
 			error = SET_ERROR(ENOSPC);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 void
 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 {
 	uint64_t used;
 	int64_t delta;
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
-	delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
-	dd->dd_phys->dd_reserved = value;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
+	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	dsl_dir_phys(dd)->dd_reserved = value;
 
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 		    delta, 0, 0, tx);
 	}
 	mutex_exit(&dd->dd_lock);
 }
 
 static void
 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
 		dsl_prop_set_sync_impl(ds,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 		    &ddsqra->ddsqra_value, tx);
 
 		VERIFY0(dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
 	} else {
 		newval = ddsqra->ddsqra_value;
 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    (longlong_t)newval);
 	}
 
 	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation)
 {
 	dsl_dir_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = ddname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = reservation;
 
 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
 	    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static dsl_dir_t *
 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
 {
 	for (; ds1; ds1 = ds1->dd_parent) {
 		dsl_dir_t *dd;
 		for (dd = ds2; dd; dd = dd->dd_parent) {
 			if (ds1 == dd)
 				return (dd);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * If delta is applied to dd, how much of that delta would be applied to
  * ancestor?  Syncing context only.
  */
 static int64_t
 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
 {
 	if (dd == ancestor)
 		return (delta);
 
 	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
+	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
 	mutex_exit(&dd->dd_lock);
 	return (would_change(dd->dd_parent, delta, ancestor));
 }
 
 typedef struct dsl_dir_rename_arg {
 	const char *ddra_oldname;
 	const char *ddra_newname;
 	cred_t *ddra_cred;
 } dsl_dir_rename_arg_t;
 
 /* ARGSUSED */
 static int
 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	int *deltap = arg;
 	char namebuf[MAXNAMELEN];
 
 	dsl_dataset_name(ds, namebuf);
 
 	if (strlen(namebuf) + *deltap >= MAXNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	return (0);
 }
 
 static int
 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_rename_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd, *newparent;
 	const char *mynewname;
 	int error;
 	int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
 
 	/* target dir should exist */
 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
 	if (error != 0)
 		return (error);
 
 	/* new parent should exist */
 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
 	    &newparent, &mynewname);
 	if (error != 0) {
 		dsl_dir_rele(dd, FTAG);
 		return (error);
 	}
 
 	/* can't rename to different pool */
 	if (dd->dd_pool != newparent->dd_pool) {
 		dsl_dir_rele(newparent, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* new name should not already exist */
 	if (mynewname == NULL) {
 		dsl_dir_rele(newparent, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/* if the name length is growing, validate child name lengths */
 	if (delta > 0) {
 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
 		    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 		if (error != 0) {
 			dsl_dir_rele(newparent, FTAG);
 			dsl_dir_rele(dd, FTAG);
 			return (error);
 		}
 	}
 
 	if (dmu_tx_is_syncing(tx)) {
 		if (spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_FS_SS_LIMIT)) {
 			/*
 			 * Although this is the check function and we don't
 			 * normally make on-disk changes in check functions,
 			 * we need to do that here.
 			 *
 			 * Ensure this portion of the tree's counts have been
 			 * initialized in case the new parent has limits set.
 			 */
 			dsl_dir_init_fs_ss_count(dd, tx);
 		}
 	}
 
 	if (newparent != dd->dd_parent) {
 		/* is there enough space? */
 		uint64_t myspace =
-		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
+		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_reserved);
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 		uint64_t fs_cnt = 0;
 		uint64_t ss_cnt = 0;
 
 		if (dsl_dir_is_zapified(dd)) {
 			int err;
 
 			err = zap_lookup(os, dd->dd_object,
 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
 			    &fs_cnt);
 			if (err != ENOENT && err != 0) {
 				dsl_dir_rele(newparent, FTAG);
 				dsl_dir_rele(dd, FTAG);
 				return (err);
 			}
 
 			/*
 			 * have to add 1 for the filesystem itself that we're
 			 * moving
 			 */
 			fs_cnt++;
 
 			err = zap_lookup(os, dd->dd_object,
 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
 			    &ss_cnt);
 			if (err != ENOENT && err != 0) {
 				dsl_dir_rele(newparent, FTAG);
 				dsl_dir_rele(dd, FTAG);
 				return (err);
 			}
 		}
 
 		/* no rename into our descendant */
 		if (closest_common_ancestor(dd, newparent) == dd) {
 			dsl_dir_rele(newparent, FTAG);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = dsl_dir_transfer_possible(dd->dd_parent,
 		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
 		if (error != 0) {
 			dsl_dir_rele(newparent, FTAG);
 			dsl_dir_rele(dd, FTAG);
 			return (error);
 		}
 	}
 
 	dsl_dir_rele(newparent, FTAG);
 	dsl_dir_rele(dd, FTAG);
 	return (0);
 }
 
 static void
 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_rename_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd, *newparent;
 	const char *mynewname;
 	int error;
 	objset_t *mos = dp->dp_meta_objset;
 
 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
 	    &mynewname));
 
 	/* Log this before we change the name. */
 	spa_history_log_internal_dd(dd, "rename", tx,
 	    "-> %s", ddra->ddra_newname);
 
 	if (newparent != dd->dd_parent) {
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 		uint64_t fs_cnt = 0;
 		uint64_t ss_cnt = 0;
 
 		/*
 		 * We already made sure the dd counts were initialized in the
 		 * check function.
 		 */
 		if (spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_FS_SS_LIMIT)) {
 			VERIFY0(zap_lookup(os, dd->dd_object,
 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
 			    &fs_cnt));
 			/* add 1 for the filesystem itself that we're moving */
 			fs_cnt++;
 
 			VERIFY0(zap_lookup(os, dd->dd_object,
 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
 			    &ss_cnt));
 		}
 
 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 
 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-		    -dd->dd_phys->dd_used_bytes,
-		    -dd->dd_phys->dd_compressed_bytes,
-		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		    -dsl_dir_phys(dd)->dd_used_bytes,
+		    -dsl_dir_phys(dd)->dd_compressed_bytes,
+		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
-		    dd->dd_phys->dd_used_bytes,
-		    dd->dd_phys->dd_compressed_bytes,
-		    dd->dd_phys->dd_uncompressed_bytes, tx);
+		    dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_compressed_bytes,
+		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 
-		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
-			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
-			    dd->dd_phys->dd_used_bytes;
+		if (dsl_dir_phys(dd)->dd_reserved >
+		    dsl_dir_phys(dd)->dd_used_bytes) {
+			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+			    dsl_dir_phys(dd)->dd_used_bytes;
 
 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 			    -unused_rsrv, 0, 0, tx);
 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
 			    unused_rsrv, 0, 0, tx);
 		}
 	}
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	/* remove from old parent zapobj */
-	error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	error = zap_remove(mos,
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx);
 	ASSERT0(error);
 
 	(void) strcpy(dd->dd_myname, mynewname);
 	dsl_dir_rele(dd->dd_parent, dd);
-	dd->dd_phys->dd_parent_obj = newparent->dd_object;
+	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
 	VERIFY0(dsl_dir_hold_obj(dp,
 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
 
 	/* add to new parent zapobj */
-	VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
+	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
 	zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
 #endif
 #endif
 
 	dsl_prop_notify_all(dd);
 
 	dsl_dir_rele(newparent, FTAG);
 	dsl_dir_rele(dd, FTAG);
 }
 
 int
 dsl_dir_rename(const char *oldname, const char *newname)
 {
 	dsl_dir_rename_arg_t ddra;
 
 	ddra.ddra_oldname = oldname;
 	ddra.ddra_newname = newname;
 	ddra.ddra_cred = CRED();
 
 	return (dsl_sync_task(oldname,
 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
 	    3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
 {
 	dsl_dir_t *ancestor;
 	int64_t adelta;
 	uint64_t avail;
 	int err;
 
 	ancestor = closest_common_ancestor(sdd, tdd);
 	adelta = would_change(sdd, -space, ancestor);
 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
 	if (avail < space)
 		return (SET_ERROR(ENOSPC));
 
 	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
 	    ancestor, cr);
 	if (err != 0)
 		return (err);
 	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
 	    ancestor, cr);
 	if (err != 0)
 		return (err);
 
 	return (0);
 }
 
 timestruc_t
 dsl_dir_snap_cmtime(dsl_dir_t *dd)
 {
 	timestruc_t t;
 
 	mutex_enter(&dd->dd_lock);
 	t = dd->dd_snap_cmtime;
 	mutex_exit(&dd->dd_lock);
 
 	return (t);
 }
 
 void
 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
 {
 	timestruc_t t;
 
 	gethrestime(&t);
 	mutex_enter(&dd->dd_lock);
 	dd->dd_snap_cmtime = t;
 	mutex_exit(&dd->dd_lock);
 }
 
 void
 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
 }
 
 boolean_t
 dsl_dir_is_zapified(dsl_dir_t *dd)
 {
 	dmu_object_info_t doi;
 
 	dmu_object_info_from_db(dd->dd_dbuf, &doi);
 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 275782)
@@ -1,1144 +1,1148 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_scan.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_userhold.h>
 
 #ifdef __FreeBSD__
 #include <sys/sysctl.h>
 #include <sys/types.h>
 #endif
 
 /*
  * ZFS Write Throttle
  * ------------------
  *
  * ZFS must limit the rate of incoming writes to the rate at which it is able
  * to sync data modifications to the backend storage. Throttling by too much
  * creates an artificial limit; throttling by too little can only be sustained
  * for short periods and would lead to highly lumpy performance. On a per-pool
  * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  * of dirty data decreases. When the amount of dirty data exceeds a
  * predetermined threshold further modifications are blocked until the amount
  * of dirty data decreases (as data is synced out).
  *
  * The limit on dirty data is tunable, and should be adjusted according to
  * both the IO capacity and available memory of the system. The larger the
  * window, the more ZFS is able to aggregate and amortize metadata (and data)
  * changes. However, memory is a limited resource, and allowing for more dirty
  * data comes at the cost of keeping other useful data in memory (for example
  * ZFS data cached by the ARC).
  *
  * Implementation
  *
  * As buffers are modified dsl_pool_dirty_space() increments both the per-
  * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
  * dirty space used; dsl_pool_undirty_space() decrements those values as data
  * is synced out from dsl_pool_sync(). While only the poolwide value is
  * relevant, the per-txg value is useful for debugging. The tunable
  * zfs_dirty_data_max determines the dirty space limit. Once that value is
  * exceeded, new writes are halted until space frees up.
  *
  * The zfs_dirty_data_sync tunable dictates the threshold at which we
  * ensure that there is a txg syncing (see the comment in txg.c for a full
  * description of transaction group stages).
  *
  * The IO scheduler uses both the dirty space limit and current amount of
  * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  *
  * The delay is also calculated based on the amount of dirty data.  See the
  * comment above dmu_tx_delay() for details.
  */
 
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
  * On FreeBSD all three values are additionally exported through the sysctl
  * declarations in the "#ifdef __FreeBSD__" section of this file.
  *
  * zfs_dirty_data_max has no static initializer here; the code that derives
  * it from the percentage is not in this part of the file.
  */
 uint64_t zfs_dirty_data_max;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
 /*
  * If there is at least this much dirty data, push out a txg.
  * dsl_pool_need_dirty_delay() calls txg_kick() once dp_dirty_total is
  * strictly greater than this value.  The default is 64 MiB.
  */
 uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
  * and delay each transaction.
  * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
  * It is a percentage of zfs_dirty_data_max (see the computation of
  * delay_min_bytes in dsl_pool_need_dirty_delay()).
  */
 int zfs_delay_min_dirty_percent = 60;
 
 /*
  * This controls how quickly the delay approaches infinity.
  * Larger values cause it to delay more for a given amount of dirty data.
  * Therefore larger values will cause there to be less dirty data for a
  * given throughput.
  *
  * For the smoothest delay, this value should be about 1 billion divided
  * by the maximum number of operations per second.  This will smoothly
  * handle between 10x and 1/10th this number.
  *
  * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
  * multiply in dmu_tx_delay().
  *
  * The default below evaluates to 500000, i.e. the formula above applied to
  * 2000 operations per second.
  */
 uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
 
 #ifdef __FreeBSD__
 
 /* Needed by the delay_min_dirty_percent handler to validate new values. */
 extern int zfs_vdev_async_write_active_max_dirty_percent;
 
 SYSCTL_DECL(_vfs_zfs);
 
 /*
  * Sysctl nodes for the write-throttle tunables.  The plain SYSCTL_UQUAD
  * entries expose the variable directly; the SYSCTL_PROC entries go through
  * a handler defined later in this file that validates the new value
  * before storing it.
  */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
     &zfs_dirty_data_max, 0,
     "The maximum amount of dirty data in bytes after which new writes are "
     "halted until space becomes available");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
     &zfs_dirty_data_max_max, 0,
     "The absolute cap on dirty_data_max when auto calculating");
 
 static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
     sysctl_zfs_dirty_data_max_percent, "I",
     "The percent of physical memory used to auto calculate dirty_data_max");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
     &zfs_dirty_data_sync, 0,
     "Force a txg if the number of dirty buffer bytes exceeds this value");
 
 static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
 /* No zfs_delay_min_dirty_percent tunable due to limit requirements */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
     sysctl_zfs_delay_min_dirty_percent, "I",
     "The limit of outstanding dirty data before transactions are delayed");
 
 static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
 /* No zfs_delay_scale tunable due to limit requirements */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_zfs_delay_scale, "QU",
     "Controls how quickly the delay approaches infinity");
 
 static int
 sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_dirty_data_max_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < 0 || val > 100)
 		return (EINVAL);
 
 	zfs_dirty_data_max_percent = val;
 
 	return (0);
 }
 
 static int
 sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_delay_min_dirty_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < zfs_vdev_async_write_active_max_dirty_percent)
 		return (EINVAL);
 
 	zfs_delay_min_dirty_percent = val;
 
 	return (0);
 }
 
 /*
  * Handler for the delay_scale sysctl.  A new value is accepted only if
  * val * zfs_dirty_data_max cannot overflow 64 bits (see the note above
  * zfs_delay_scale); otherwise EINVAL is returned.
  *
  * zfs_dirty_data_max is itself a writable sysctl and may be zero, so it is
  * sampled once and the division is skipped in that case: with a zero
  * multiplier no value of val can overflow the product.
  */
 static int
 sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	uint64_t dirty_max;
 	int err;
 
 	val = zfs_delay_scale;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	/* Use one snapshot for both the zero test and the division. */
 	dirty_max = zfs_dirty_data_max;
 	if (dirty_max != 0 && val > UINT64_MAX / dirty_max)
 		return (EINVAL);
 
 	zfs_delay_scale = val;
 
 	return (0);
 }
 #endif
 
 /*
  * Both values are initialized from MSEC2NSEC(10).
  * NOTE(review): no reader of these two variables is visible in this part
  * of the file -- confirm whether they are still referenced elsewhere.
  */
 hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 
 /*
  * Look up `name` in the child-directory ZAP of the pool's root dsl_dir and
  * hold the dsl_dir it refers to, using dp as the hold tag.  On success the
  * held directory is returned through *ddp.
  *
  * Returns the zap_lookup() error if the name is not found (or the lookup
  * fails), otherwise the result of dsl_dir_hold_obj().
  */
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
-	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
 
 	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 }
 
 /*
  * Allocate a zero-filled dsl_pool_t for `spa` and initialize its in-core
  * state: a copy of the spa's root block pointer, the config rwlock, the
  * txg state, the four per-txg dirty/sync-task lists, dp_lock, the
  * space-available condition variable and the vnode-release taskq.
  * Used by dsl_pool_init() and dsl_pool_create().
  */
 static dsl_pool_t *
 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 {
 	blkptr_t *rootbp = spa_get_rootblkptr(spa);
 	dsl_pool_t *pool = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 
 	pool->dp_spa = spa;
 	pool->dp_meta_rootbp = *rootbp;
 	rrw_init(&pool->dp_config_rwlock, B_TRUE);
 	txg_init(pool, txg);
 
 	/* Per-txg lists, each linked through a node embedded in its element. */
 	txg_list_create(&pool->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
 	txg_list_create(&pool->dp_dirty_zilogs,
 	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&pool->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&pool->dp_sync_tasks,
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	mutex_init(&pool->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pool->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	pool->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1,
 	    minclsyspri, 1, 4, 0);
 
 	return (pool);
 }
 
 /*
  * Create the in-core dsl_pool_t for `spa` and open its meta-objset from
  * the root block pointer.  On success the new pool is returned through
  * *dpp; on failure the pool is torn down with dsl_pool_close(), *dpp is
  * left untouched and the dmu_objset_open_impl() error is returned.
  */
 int
 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	int error;
 
 	error = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 	    &dp->dp_meta_objset);
 	if (error == 0) {
 		*dpp = dp;
 		return (0);
 	}
 
 	dsl_pool_close(dp);
 	return (error);
 }
 
 /*
  * Populate an initialized dsl_pool_t from the on-disk pool directory.
  * With dp_config_rwlock held as writer, this looks up and holds, in order:
  * the root dsl_dir, the MOS dir, the origin snapshot (pools at or above
  * SPA_VERSION_ORIGIN), the free dir and free bpobj (pools at or above
  * SPA_VERSION_DEADLISTS), the leak dir (best effort), the bptree and
  * empty-bpobj object numbers when the corresponding features are active,
  * and the temporary-userrefs object (absence is not an error).  Finally it
  * calls dsl_scan_init().
  *
  * Returns 0 or the first error encountered.  On error the holds already
  * stored in dp are not dropped here; dsl_pool_close() releases whichever
  * of them are non-NULL.
  */
 int
 dsl_pool_open(dsl_pool_t *dp)
 {
 	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
 	if (err)
 		goto out;
 
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir);
 	if (err)
 		goto out;
 
 	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 	if (err)
 		goto out;
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
-		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
-		    FTAG, &ds);
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 		dsl_dir_rele(dd, dp);
 		if (err)
 			goto out;
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
 			goto out;
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	/*
 	 * Note: errors ignored, because the leak dir will not exist if we
 	 * have not encountered a leak yet.
 	 */
 	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 	    &dp->dp_leak_dir);
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj);
 		if (err != 0)
 			goto out;
 	}
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj);
 		if (err != 0)
 			goto out;
 	}
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
 	if (err == ENOENT)
 		err = 0;
 	if (err)
 		goto out;
 
 	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	return (err);
 }
 
 /*
  * Tear down a dsl_pool_t: release every hold stored in it that is
  * non-NULL, close the free bpobj, evict the meta-objset, destroy the
  * per-txg lists and synchronization objects, and free the structure.
  * It is also used by dsl_pool_init() to undo a partially set up pool, so
  * each optional member is checked before it is released.
  */
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
 	/*
 	 * Drop our references from dsl_pool_open().
 	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
 	 */
 	if (dp->dp_origin_snap)
 		dsl_dataset_rele(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir)
 		dsl_dir_rele(dp->dp_free_dir, dp);
 	if (dp->dp_leak_dir)
 		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 
 	bpobj_close(&dp->dp_free_bpobj);
 
 	/*
 	 * Undo the dmu_objset_open_impl(mos) from dsl_pool_init(), or the
 	 * dmu_objset_create_impl(mos) from dsl_pool_create().
 	 */
 	if (dp->dp_meta_objset)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
 	dsl_scan_fini(dp);
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_vnrele_taskq);
 	if (dp->dp_blkstats)
 		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
 /*
  * Create the DSL layer of a brand-new pool in transaction group `txg`.
  *
  * Under dp_config_rwlock (writer) and a single assigned tx this creates the
  * meta-objset, the pool directory, the scan state, the root dsl_dir, the
  * MOS dir, (version permitting) the free dir, free bpobj and origin
  * snapshot, and finally the root dataset and its objset.  In kernel builds
  * the root filesystem is populated via zfs_create_fs() using `zplprops`.
  *
  * Every step other than the pool-directory creation is checked with
  * VERIFY0/VERIFY; the pool-directory creation result is only checked by
  * ASSERT0.  Returns the new dsl_pool_t.
  */
 dsl_pool_t *
 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	objset_t *os;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
 	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT0(err);
 
 	/* Initialize scan structures */
 	VERIFY0(dsl_scan_init(dp, txg));
 
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		/* create and open the free dir */
 		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 		    FREE_DIR_NAME, tx);
 		VERIFY0(dsl_pool_open_special_dir(dp,
 		    FREE_DIR_NAME, &dp->dp_free_dir));
 
 		/* create and open the free_bplist */
 		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/* create the root dataset */
 	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
 	/* create the root objset */
 	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
 	dsl_dataset_rele(ds, FTAG);
 
 	dmu_tx_commit(tx);
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 
 	return (dp);
 }
 
 /*
  * Account for the meta-objset space in its placeholder dsl_dir.
  *
  * The three deltas are only accumulated here, under dp_lock, in the
  * dp_mos_*_delta fields; dsl_pool_sync() later applies them to dp_mos_dir
  * and resets them to zero.  `comp` and `uncomp` are asserted to be equal.
  */
 void
 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp)
 {
 	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 	mutex_enter(&dp->dp_lock);
 	dp->dp_mos_used_delta += used;
 	dp->dp_mos_compressed_delta += comp;
 	dp->dp_mos_uncompressed_delta += uncomp;
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * bplist_iterate() callback used by dsl_pool_sync(): insert block pointer
  * `bp` into the deadlist passed as `arg`.  Always returns 0.
  */
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *deadlist = arg;
 
 	dsl_deadlist_insert(deadlist, bp, tx);
 
 	return (0);
 }
 
 /*
  * Sync the meta-objset under a fresh root zio, wait for that zio to
  * complete successfully, and then hand the resulting dp_meta_rootbp to the
  * spa as the new root block pointer.
  */
 static void
 dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	zio_t *rio;
 
 	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	dmu_objset_sync(dp->dp_meta_objset, rio, tx);
 	VERIFY0(zio_wait(rio));
 
 	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 }
 
 /*
  * Apply a signed change to dp_dirty_total and signal dp_spaceavail_cv when
  * the resulting total does not exceed zfs_dirty_data_max.
  *
  * The caller must hold dp_lock (asserted).  A negative delta is asserted
  * not to exceed the current total.
  */
 static void
 dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 {
 	ASSERT(MUTEX_HELD(&dp->dp_lock));
 
 	if (delta < 0)
 		ASSERT3U(-delta, <=, dp->dp_dirty_total);
 
 	dp->dp_dirty_total += delta;
 
 	/*
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
 	if (dp->dp_dirty_total <= zfs_dirty_data_max)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
 /*
  * Sync one pass of transaction group `txg` for this pool.
  *
  * In order, within a single assigned tx:
  *  1. write out every dataset on the dirty list and wait for the I/O;
  *  2. zero out whatever dirty-space accounting is still recorded for txg;
  *  3. run the user/group quota updates for the datasets just synced;
  *  4. sync the datasets that step 3 dirtied again and wait for the I/O;
  *  5. move pending deadlist entries to the on-disk deadlists and drop the
  *     per-dataset dbuf holds;
  *  6. sync dirty dsl_dirs;
  *  7. apply the accumulated MOS space deltas to dp_mos_dir;
  *  8. sync the MOS if it has dirty or freed dnodes;
  *  9. run any queued sync tasks.
  *
  * The statement order matters; see the comments on the individual steps.
  */
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	zio_t *zio;
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	objset_t *mos = dp->dp_meta_objset;
 	list_t synced_datasets;
 
 	list_create(&synced_datasets, sizeof (dsl_dataset_t),
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Write out all dirty blocks of dirty datasets.
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * We have written all of the accounted dirty data, so our
 	 * dp_space_towrite should now be zero.  However, some seldom-used
 	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
 	 * rounding error in dbuf_write_physdone).
 	 * Shore up the accounting of any dirtied space now.
 	 */
 	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group space accounting.
 	 */
 	for (ds = list_head(&synced_datasets); ds != NULL;
 	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 	}
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
 	 * sync tasks, so that any snapshots will have the correct
 	 * user accounting information (and we won't get confused
 	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		/*
 		 * NOTE(review): presumably this drops the additional hold
 		 * taken when the dataset was dirtied a second time; the
 		 * hold from the first pass is released in the cleanup loop
 		 * below.  Confirm against dsl_dataset_dirty().
 		 */
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
 	 *  - release hold from dsl_dataset_dirty()
 	 */
 	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 		objset_t *os = ds->ds_objset;
 		bplist_iterate(&ds->ds_pending_deadlist,
 		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 		ASSERT(!dmu_objset_is_dirty(os, txg));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
 	}
 
 	/*
 	 * The MOS's space is accounted for in the pool/$MOS
 	 * (dp_mos_dir).  We can't modify the mos while we're syncing
 	 * it, so we remember the deltas and apply them here.
 	 */
 	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 	    dp->dp_mos_uncompressed_delta != 0) {
 		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 		    dp->dp_mos_used_delta,
 		    dp->dp_mos_compressed_delta,
 		    dp->dp_mos_uncompressed_delta, tx);
 		dp->dp_mos_used_delta = 0;
 		dp->dp_mos_compressed_delta = 0;
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 		dsl_pool_sync_mos(dp, tx);
 	}
 
 	/*
 	 * If we modify a dataset in the same txg that we want to destroy it,
 	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
 	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 	 * and clearing the hold on it) before we process the sync_tasks.
 	 * The MOS data dirtied by the sync_tasks will be synced on the next
 	 * pass.
 	 */
 	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 		/*
 		 * No more sync tasks should have been added while we
 		 * were syncing.
 		 */
 		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 			dsl_sync_task_sync(dst, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 /*
  * Post-sync cleanup for `txg`: for every zilog on the dirty-zilog list,
  * run zil_clean() and drop the dbuf hold on its dataset that was taken
  * with the zilog as tag.  The meta-objset is asserted to be clean for this
  * txg afterwards.
  */
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	zilog_t *zilog;
 
 	/* Explicit NULL test, matching the other txg_list_remove() loops. */
 	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) != NULL) {
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 		zil_clean(zilog, txg);
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
  * Returns non-zero when the caller is running in syncing context: either
  * the current thread is this pool's tx_sync_thread, or the spa reports
  * that it is still initializing.  Returns 0 otherwise.
  */
 int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	if (curthread == dp->dp_tx.tx_sync_thread)
 		return (1);
 
 	return (spa_is_initializing(dp->dp_spa) ? 1 : 0);
 }
 
 /*
  * Return the pool's dspace minus its slop-space reservation.
  *
  * When `netfree` is set (the caller is assessing whether a free may
  * proceed) only half of the reservation is subtracted, so that forward
  * progress remains possible -- e.g. rm(1) on a full pool.
  */
 uint64_t
 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 {
 	uint64_t dspace = spa_get_dspace(dp->dp_spa);
 	uint64_t slop = spa_get_slop_space(dp->dp_spa);
 
 	return (dspace - (netfree ? (slop >> 1) : slop));
 }
 
 /*
  * Report whether the pool's dirty data exceeds the delay threshold, i.e.
  * zfs_delay_min_dirty_percent percent of zfs_dirty_data_max.
  *
  * As a side effect, txg_kick() is called when the dirty total is above
  * zfs_dirty_data_sync.  Both comparisons are made under dp_lock; the
  * threshold itself is computed before the lock is taken.
  */
 boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	boolean_t need_delay;
 	uint64_t threshold;
 
 	threshold = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_total > zfs_dirty_data_sync)
 		txg_kick(dp);
 	need_delay = (dp->dp_dirty_total > threshold);
 	mutex_exit(&dp->dp_lock);
 
 	return (need_delay);
 }
 
 /*
  * Record `space` newly dirtied bytes against tx's transaction group and
  * against the pool-wide dirty total.  Non-positive amounts are ignored.
  */
 void
 dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
 	if (space <= 0)
 		return;
 
 	mutex_enter(&dp->dp_lock);
 	dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 	dsl_pool_dirty_delta(dp, space);
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * Remove up to `space` bytes from the dirty accounting of `txg`.
  *
  * `space` is asserted to be non-negative; zero is a no-op.  If more is
  * requested than is still recorded in dp_dirty_pertxg[] for that txg, the
  * amount is clamped to the recorded value.  The clamped amount is then
  * subtracted from both the per-txg counter and, via
  * dsl_pool_dirty_delta(), the pool-wide total.
  */
 void
 dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
 	ASSERT3S(space, >=, 0);
 	if (space == 0)
 		return;
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 		/* XXX writing something we didn't dirty? */
 		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
 	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 	ASSERT3U(dp->dp_dirty_total, >=, space);
 	dsl_pool_dirty_delta(dp, -space);
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones(); `arg`
  * is the syncing dmu_tx_t.
  *
  * Starting from head dataset `hds`, walk backwards through
  * ds_prev_snap_obj for as long as the previous snapshot's
  * ds_next_snap_obj points back at the current dataset.  The walk ends
  * either at a dataset with no previous snapshot, or at a previous
  * snapshot whose "next" is some other dataset.
  *
  * In the first case the dataset is attached to the pool's origin
  * snapshot (unless it is the origin itself, in which case nothing is
  * done): its prev-snap fields and its dir's dd_origin_obj are pointed at
  * the origin and the origin's child count is bumped.  In both cases the
  * dataset's object number is then added to the previous snapshot's
  * next-clones ZAP, which is created on first use.
  *
  * Returns 0, or the error from a failed dsl_dataset_hold_obj().
  */
 /* ARGSUSED */
 static int
 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds, *prev = NULL;
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 		prev = NULL;
 	}
 
 	if (prev == NULL) {
 		prev = dp->dp_origin_snap;
 
 		/*
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
-		ASSERT0(prev->ds_phys->ds_bp.blk_birth);
+		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			return (0);
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
-		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+		dsl_dataset_phys(ds)->ds_prev_snap_txg =
+		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_num_children++;
+		dsl_dataset_phys(prev)->ds_num_children++;
 
-		if (ds->ds_phys->ds_next_snap_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+			    ds, &ds->ds_prev));
 		}
 	}
 
-	ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
-	ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
+	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
-	if (prev->ds_phys->ds_next_clones_obj == 0) {
+	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_next_clones_obj =
+		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
-	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	if (prev != dp->dp_origin_snap)
 		dsl_dataset_rele(prev, FTAG);
 	return (0);
 }
 
 /*
  * Run upgrade_clones_cb() over the root dir and all of its children
  * (DS_FIND_CHILDREN).  Must be called in syncing context with the origin
  * snapshot already set up (both asserted); any failure of the traversal is
  * fatal (VERIFY0).
  */
 void
 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 	    tx, DS_FIND_CHILDREN));
 }
 
 /*
  * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones();
  * `arg` is the syncing dmu_tx_t.
  *
  * If the dataset's dsl_dir has an origin, record the dataset's object
  * number in the dd_clones ZAP of the origin's dsl_dir, creating that ZAP
  * first if it does not exist yet.  Datasets without an origin are left
  * alone.  Always returns 0; failures are fatal (VERIFY0).
  */
 /* ARGSUSED */
 static int
 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	objset_t *mos = dp->dp_meta_objset;
 
-	if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 		dsl_dataset_t *origin;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
-		if (origin->ds_dir->dd_phys->dd_clones == 0) {
+		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
-			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+			dsl_dir_phys(origin->ds_dir)->dd_clones =
+			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+			    0, tx);
 		}
 
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
+		    dsl_dir_phys(origin->ds_dir)->dd_clones,
+		    ds->ds_object, tx));
 
 		dsl_dataset_rele(origin, FTAG);
 	}
 	return (0);
 }
 
 /*
  * Pool-upgrade step, run in syncing context (asserted).
  *
  * Creates and opens the free dir, allocates the pool's free bpobj,
  * registers it in the pool directory under DMU_POOL_FREE_BPOBJ and opens
  * it, then runs upgrade_dir_clones_cb() over the root dir and all of its
  * children.  Every step is fatal on failure (VERIFY0).
  */
 void
 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    FREE_DIR_NAME, &dp->dp_free_dir));
 
 	/*
 	 * We can't use bpobj_alloc(), because spa_version() still
 	 * returns the old version, and we need a new-version bpobj with
 	 * subobj support.  So call dmu_object_alloc() directly.
 	 */
 	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 }
 
 /*
  * Create the origin dsl_dir and dataset under the root dir, snapshot that
  * dataset, and hold the resulting snapshot (tag dp) as dp_origin_snap.
  *
  * Must be called in syncing context, with dp_config_rwlock held as writer
  * and with no origin snapshot set up yet (all asserted).
  */
 void
 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t dsobj;
 	dsl_dataset_t *ds;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap == NULL);
 	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
 	/* create the origin dir, ds, & snap-ds */
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
-	VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Accessor for the pool's vnode-release taskq, which is created in
  * dsl_pool_open_impl() under the name "zfs_vn_rele_taskq".
  */
 taskq_t *
 dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 {
 	return (dp->dp_vnrele_taskq);
 }
 
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
  *
  * Entry names have the form "<dataset object in hex>-<tag>", as written by
  * dsl_pool_user_hold_rele_impl().  The entries are grouped into an nvlist
  * keyed by the part before the '-', each value being an nvlist of tags,
  * which is then handed to dsl_dataset_user_release_tmp().
  *
  * Does nothing if the pool has no temporary-userrefs object.  An entry
  * whose name contains no '-' cannot be split into dataset and tag; it is
  * skipped rather than dereferencing the NULL returned by strchr().
  */
 void
 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 {
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	nvlist_t *holds;
 
 	if (zapobj == 0)
 		return;
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 
 	holds = fnvlist_alloc();
 
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		char *htag;
 		nvlist_t *tags;
 
 		htag = strchr(za.za_name, '-');
 		if (htag == NULL)
 			continue;
 
 		/* Split za_name in place: "<dsobj>\0<tag>". */
 		*htag = '\0';
 		++htag;
 		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 			tags = fnvlist_alloc();
 			fnvlist_add_boolean(tags, htag);
 			fnvlist_add_nvlist(holds, za.za_name, tags);
 			fnvlist_free(tags);
 		} else {
 			fnvlist_add_boolean(tags, htag);
 		}
 	}
 	dsl_dataset_user_release_tmp(dp, holds);
 	fnvlist_free(holds);
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Create the pool-wide ZAP object that stores temporary snapshot holds,
  * link it into the pool directory under DMU_POOL_TMP_USERREFS, and
  * remember its object number in dp_tmp_userrefs_obj.
  *
  * Must be called in syncing context and only while no such object is
  * recorded in dp (both asserted).
  */
 void
 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dp->dp_tmp_userrefs_obj = zap_create_link(dp->dp_meta_objset,
 	    DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, tx);
 }
 
 /*
  * Common implementation of dsl_pool_user_hold() and
  * dsl_pool_user_release().
  *
  * The entry name is "<dsobj in hex>-<tag>".  When `holding` is set, the
  * entry is added to the temporary-userrefs ZAP with `now` as its value,
  * creating the ZAP first if the pool does not have one.  Otherwise the
  * entry is removed; if the pool has no such ZAP, ENOENT is returned.
  *
  * Returns the result of zap_add() or zap_remove().  Must be called in
  * syncing context on a pool at or above SPA_VERSION_USERREFS (asserted).
  */
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t holds_obj = dp->dp_tmp_userrefs_obj;
 	char *key;
 	int rc;
 
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/*
 	 * If the pool was created prior to SPA_VERSION_USERREFS, the
 	 * zap object for temporary holds might not exist yet.
 	 */
 	if (holds_obj == 0) {
 		if (!holding)
 			return (SET_ERROR(ENOENT));
 
 		dsl_pool_user_hold_create_obj(dp, tx);
 		holds_obj = dp->dp_tmp_userrefs_obj;
 	}
 
 	key = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 	rc = holding ?
 	    zap_add(mos, holds_obj, key, 8, 1, &now, tx) :
 	    zap_remove(mos, holds_obj, key, tx);
 	strfree(key);
 
 	return (rc);
 }
 
 /*
  * Add a temporary hold for the given dataset object and tag.
  *
  * `now` is stored as the value of the hold entry.  Thin wrapper around
  * dsl_pool_user_hold_rele_impl() with holding == B_TRUE; returns its
  * result.
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     uint64_t now, dmu_tx_t *tx)
 {
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 }
 
 /*
  * Drop the temporary hold, identified by tag, on dataset object dsobj.
  */
 int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
 	int error;
 
 	/* The timestamp argument is not used when removing an entry. */
 	error = dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE);
 	return (error);
 }
 
 /*
  * DSL Pool Configuration Lock
  *
  * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
  * creation / destruction / rename / property setting).  It must be held for
  * read to hold a dataset or dsl_dir.  I.e. you must call
  * dsl_pool_config_enter() or dsl_pool_hold() before calling
  * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
  * must be held continuously until all datasets and dsl_dirs are released.
  *
  * The only exception to this rule is that if a "long hold" is placed on
  * a dataset, then the dp_config_rwlock may be dropped while the dataset
  * is still held.  The long hold will prevent the dataset from being
  * destroyed -- the destroy will fail with EBUSY.  A long hold can be
  * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
  * (by calling dsl_{dataset,objset}_{try}own{_obj}).
  *
  * Legitimate long-holders (including owners) should be long-running, cancelable
  * tasks that should cause "zfs destroy" to fail.  This includes DMU
  * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
  * "zfs send", and "zfs diff".  There are several other long-holders whose
  * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
  *
  * The usual formula for long-holding would be:
  * dsl_pool_hold()
  * dsl_dataset_hold()
  * ... perform checks ...
  * dsl_dataset_long_hold()
  * dsl_pool_rele()
  * ... perform long-running task ...
  * dsl_dataset_long_rele()
  * dsl_dataset_rele()
  *
  * Note that when the long hold is released, the dataset is still held but
  * the pool is not held.  The dataset may change arbitrarily during this time
  * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
  * dataset except release it.
  *
  * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
  * or modifying operations.
  *
  * Modifying operations should generally use dsl_sync_task().  The synctask
  * infrastructure enforces proper locking strategy with respect to the
  * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
  *
  * Read-only operations will manually hold the pool, then the dataset, obtain
  * information from the dataset, then release the pool and dataset.
  * dmu_objset_{hold,rele}() are convenience routines that also do the pool
  * hold/rele.
  */
 
 /*
  * Open the pool via spa_open(name, ...) and enter its config lock as
  * reader, both on behalf of tag.  On success *dp is set and 0 is returned;
  * on failure the spa_open() error is returned and *dp is not written.
  * Undo with dsl_pool_rele() using the same tag.
  */
 int
 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 {
 	spa_t *spa;
 	int error = spa_open(name, &spa, tag);
 
 	if (error != 0)
 		return (error);
 
 	*dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(*dp, tag);
 	return (0);
 }
 
 /*
  * Counterpart of dsl_pool_hold(): leave the config lock, then close the spa.
  */
 void
 dsl_pool_rele(dsl_pool_t *dp, void *tag)
 {
 	spa_t *spa = dp->dp_spa;
 
 	dsl_pool_config_exit(dp, tag);
 	spa_close(spa, tag);
 }
 
 /*
  * Take the pool's dp_config_rwlock as reader on behalf of tag.  Asserts that
  * the lock is not already held for read (see the note below on how the
  * rrwlock tracks readers).
  */
 void
 dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
 {
 	/*
 	 * We use a "reentrant" reader-writer lock, but not reentrantly.
 	 *
 	 * The rrwlock can (with the track_all flag) track all reading threads,
 	 * which is very useful for debugging which code path failed to release
 	 * the lock, and for verifying that the *current* thread does hold
 	 * the lock.
 	 *
 	 * (Unlike a rwlock, which knows that N threads hold it for
 	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
 	 * if any thread holds it for read, even if this thread doesn't).
 	 */
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
 }
 
 /*
  * Release the hold on the pool's dp_config_rwlock that was taken with tag.
  */
 void
 dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
 {
 	rrw_exit(&dp->dp_config_rwlock, tag);
 }
 
 /*
  * Report whether the pool's dp_config_rwlock is held, as determined by
  * RRW_LOCK_HELD().
  */
 boolean_t
 dsl_pool_config_held(dsl_pool_t *dp)
 {
 	boolean_t held = RRW_LOCK_HELD(&dp->dp_config_rwlock);
 
 	return (held);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c	(revision 275782)
@@ -1,1125 +1,1126 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 
 #include "zfs_prop.h"
 
 #define	ZPROP_INHERIT_SUFFIX "$inherit"
 #define	ZPROP_RECVD_SUFFIX "$recvd"
 
 /*
  * Copy the built-in default value of the named property into buf.
  *
  *	propname	property name, resolved with zfs_name_to_prop()
  *	intsz		size of one value unit; must be 1 for string
  *			properties and 8 for numeric ones
  *	numints		capacity of buf in units of intsz
  *	buf		destination buffer
  *
  * Returns 0 on success, ENOENT if the name is not a known property or the
  * property is read-only (and not set-once), or EOVERFLOW if intsz/numints
  * do not fit the property's type.  A string default that does not fit is
  * truncated; the result is NUL-terminated whenever numints is at least 1.
  */
 static int
 dodefault(const char *propname, int intsz, int numints, void *buf)
 {
 	zfs_prop_t prop;
 
 	/*
 	 * The setonce properties are read-only, BUT they still
 	 * have a default value that can be used as the initial
 	 * value.
 	 */
 	if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
 	    (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
 		return (SET_ERROR(ENOENT));
 
 	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
 		if (intsz != 1)
 			return (SET_ERROR(EOVERFLOW));
 		(void) strncpy(buf, zfs_prop_default_string(prop),
 		    numints);
 		/*
 		 * strncpy() does not terminate the destination when the
 		 * source fills it completely.  Callers print buf as a C
 		 * string, so terminate it explicitly.
 		 */
 		if (numints > 0)
 			((char *)buf)[numints - 1] = '\0';
 	} else {
 		if (intsz != 8 || numints < 1)
 			return (SET_ERROR(EOVERFLOW));
 
 		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
 	}
 
 	return (0);
 }
 
 /*
  * Look up propname starting at dd and walking up through dd_parent.
  * In each directory the local value is checked first, then (unless an
  * explicit "$inherit" entry exists) the "$recvd" value.  If no directory
  * supplies a value, the property default from dodefault() is used.
  *
  * When setpoint is non-NULL it receives the name of the directory that
  * supplied the value, ZPROP_SOURCE_VAL_RECVD for a received value that was
  * not inherited, or stays the empty string otherwise.  "snapshot" makes
  * even the starting directory count as an inherited source.
  *
  * The pool config lock must be held.  Returns 0 or an errno value.
  */
 int
 dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
     int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
 {
 	int err = ENOENT;
 	dsl_dir_t *target = dd;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	zfs_prop_t prop;
 	boolean_t inheritable;
 	boolean_t inheriting = B_FALSE;
 	char *inheritstr;
 	char *recvdstr;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 
 	if (setpoint)
 		setpoint[0] = '\0';
 
 	prop = zfs_name_to_prop(propname);
 	/* Names that are not known properties are treated as inheritable. */
 	inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
 	inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
 	recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
 	/*
 	 * Note: dd may become NULL, therefore we shouldn't dereference it
 	 * after this loop.
 	 */
 	for (; dd != NULL; dd = dd->dd_parent) {
 		if (dd != target || snapshot) {
 			if (!inheritable)
 				break;
 			inheriting = B_TRUE;
 		}
 
 		/* Check for a local value. */
-		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-		    intsz, numints, buf);
+		err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname, intsz, numints, buf);
 		if (err != ENOENT) {
 			if (setpoint != NULL && err == 0)
 				dsl_dir_name(dd, setpoint);
 			break;
 		}
 
 		/*
 		 * Skip the check for a received value if there is an explicit
 		 * inheritance entry.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 		    inheritstr);
 		if (err != 0 && err != ENOENT)
 			break;
 
 		if (err == ENOENT) {
 			/* Check for a received value. */
-			err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+			err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 			    recvdstr, intsz, numints, buf);
 			if (err != ENOENT) {
 				if (setpoint != NULL && err == 0) {
 					if (inheriting) {
 						dsl_dir_name(dd, setpoint);
 					} else {
 						(void) strcpy(setpoint,
 						    ZPROP_SOURCE_VAL_RECVD);
 					}
 				}
 				break;
 			}
 		}
 
 		/*
 		 * If we found an explicit inheritance entry, err is zero even
 		 * though we haven't yet found the value, so reinitializing err
 		 * at the end of the loop (instead of at the beginning) ensures
 		 * that err has a valid post-loop value.
 		 */
 		err = SET_ERROR(ENOENT);
 	}
 
 	/* Nothing found in any directory: fall back to the default value. */
 	if (err == ENOENT)
 		err = dodefault(propname, intsz, numints, buf);
 
 	strfree(inheritstr);
 	strfree(recvdstr);
 
 	return (err);
 }
 
 /*
  * Get the value of propname for dataset ds.  If the dataset has its own
  * props object (ds_props_obj != 0; the code asserts that this only happens
  * for snapshots), that object is searched first for a local and then for a
  * received value.  If there is no such object, or nothing is found in it,
  * the lookup continues in the dataset's dsl_dir via dsl_prop_get_dd().
  *
  * setpoint, when non-NULL, receives the dataset name for a local value or
  * ZPROP_SOURCE_VAL_RECVD for a received one.  The pool config lock must be
  * held.  Returns 0 or an errno value.
  */
 int
 dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
     int intsz, int numints, void *buf, char *setpoint)
 {
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	boolean_t inheritable;
 	boolean_t snapshot;
 	uint64_t zapobj;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
 	snapshot = dsl_dataset_is_snapshot(ds);
-	zapobj = ds->ds_phys->ds_props_obj;
+	zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 
 	if (zapobj != 0) {
 		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 		int err;
 
 		ASSERT(snapshot);
 
 		/* Check for a local value. */
 		err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
 		if (err != ENOENT) {
 			if (setpoint != NULL && err == 0)
 				dsl_dataset_name(ds, setpoint);
 			return (err);
 		}
 
 		/*
 		 * Skip the check for a received value if there is an explicit
 		 * inheritance entry.
 		 */
 		if (inheritable) {
 			char *inheritstr = kmem_asprintf("%s%s", propname,
 			    ZPROP_INHERIT_SUFFIX);
 			err = zap_contains(mos, zapobj, inheritstr);
 			strfree(inheritstr);
 			if (err != 0 && err != ENOENT)
 				return (err);
 		}
 
 		/*
 		 * err is still ENOENT here unless an inheritance entry was
 		 * found above.
 		 */
 		if (err == ENOENT) {
 			/* Check for a received value. */
 			char *recvdstr = kmem_asprintf("%s%s", propname,
 			    ZPROP_RECVD_SUFFIX);
 			err = zap_lookup(mos, zapobj, recvdstr,
 			    intsz, numints, buf);
 			strfree(recvdstr);
 			if (err != ENOENT) {
 				if (setpoint != NULL && err == 0)
 					(void) strcpy(setpoint,
 					    ZPROP_SOURCE_VAL_RECVD);
 				return (err);
 			}
 		}
 	}
 
 	return (dsl_prop_get_dd(ds->ds_dir, propname,
 	    intsz, numints, buf, setpoint, snapshot));
 }
 
 /*
  * Subscribe to changes of the named property on ds.  The callback is run
  * once right away with the current value, and again each time the property
  * changes, until the callback is unregistered.
  *
  * Returns 0 on success, or the error from fetching the property's current
  * integer value (e.g. when the property is not an integer).
  */
 int
 dsl_prop_register(dsl_dataset_t *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_prop_cb_record_t *rec;
 	char *namecopy;
 	uint64_t curval;
 	int error;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 
 	error = dsl_prop_get_int_ds(ds, propname, &curval);
 	if (error != 0)
 		return (error);
 
 	/* The record keeps its own copy of the property name. */
 	namecopy = kmem_alloc(strlen(propname)+1, KM_SLEEP);
 	(void) strcpy(namecopy, propname);
 
 	rec = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
 	rec->cbr_ds = ds;
 	rec->cbr_propname = namecopy;
 	rec->cbr_func = callback;
 	rec->cbr_arg = cbarg;
 
 	mutex_enter(&dd->dd_lock);
 	list_insert_head(&dd->dd_prop_cbs, rec);
 	mutex_exit(&dd->dd_lock);
 
 	/* Initial notification with the value read above. */
 	rec->cbr_func(rec->cbr_arg, curval);
 	return (0);
 }
 
 int
 dsl_prop_get(const char *dsname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint)
 {
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_hold(dsname, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
 	    intsz, numints, buf, setpoint);
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Fetch the current value of an integer property by dataset name.  The
  * value may already be stale when this returns, so do NOT follow up with
  * dsl_prop_register() on the assumption that nothing changed in between.
  *
  * Return 0 on success, ENOENT if ddname is invalid.
  */
 int
 dsl_prop_get_integer(const char *ddname, const char *propname,
     uint64_t *valuep, char *setpoint)
 {
 	int error;
 
 	/* One 8-byte integer. */
 	error = dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint);
 	return (error);
 }
 
 /*
  * Fetch the current value of an integer property for an already-held
  * dataset.  The setpoint is not reported.
  */
 int
 dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
     uint64_t *valuep)
 {
 	int error;
 
 	/* One 8-byte integer, no setpoint buffer. */
 	error = dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL);
 	return (error);
 }
 
 /*
  * Predict the effective value of the given special property if it were set with
  * the given value and source. This is not a general purpose function. It exists
  * only to handle the special requirements of the quota and reservation
  * properties. The fact that these properties are non-inheritable greatly
  * simplifies the prediction logic.
  *
  * The prediction is stored in *newvalp.  On pools older than
  * SPA_VERSION_RECVD_PROPS the source is first reduced to either
  * ZPROP_SRC_NONE or ZPROP_SRC_LOCAL.
  *
  * Returns 0 on success, a positive error code on failure, or -1 if called with
  * a property not handled by this function.
  */
 int
 dsl_prop_predict(dsl_dir_t *dd, const char *propname,
     zprop_source_t source, uint64_t value, uint64_t *newvalp)
 {
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	objset_t *mos;
 	uint64_t zapobj;
 	uint64_t version;
 	char *recvdstr;
 	int err = 0;
 
 	/* Only the quota and reservation properties are supported. */
 	switch (prop) {
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_REFRESERVATION:
 		break;
 	default:
 		return (-1);
 	}
 
 	mos = dd->dd_pool->dp_meta_objset;
-	zapobj = dd->dd_phys->dd_props_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
 	recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
 	version = spa_version(dd->dd_pool->dp_spa);
 	if (version < SPA_VERSION_RECVD_PROPS) {
 		if (source & ZPROP_SRC_NONE)
 			source = ZPROP_SRC_NONE;
 		else if (source & ZPROP_SRC_RECEIVED)
 			source = ZPROP_SRC_LOCAL;
 	}
 
 	switch (source) {
 	case ZPROP_SRC_NONE:
 		/* Revert to the received value, if any. */
 		err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
 		if (err == ENOENT)
 			*newvalp = 0;
 		break;
 	case ZPROP_SRC_LOCAL:
 		*newvalp = value;
 		break;
 	case ZPROP_SRC_RECEIVED:
 		/*
 		 * If there's no local setting, then the new received value will
 		 * be the effective value.
 		 */
 		err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
 		if (err == ENOENT)
 			*newvalp = value;
 		break;
 	case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
 		/*
 		 * We're clearing the received value, so the local setting (if
 		 * it exists) remains the effective value.
 		 */
 		err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
 		if (err == ENOENT)
 			*newvalp = 0;
 		break;
 	default:
 		panic("unexpected property source: %d", source);
 	}
 
 	strfree(recvdstr);
 
 	/* A missing entry was already handled above and is not an error. */
 	if (err == ENOENT)
 		return (0);
 
 	return (err);
 }
 
 /*
  * Remove a registered callback.  The record must match on dataset, property
  * name, callback function and callback argument.  Returns 0 on success, or
  * ENOMSG if no matching callback is registered.
  */
 int
 dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_prop_cb_record_t *rec;
 
 	mutex_enter(&dd->dd_lock);
 	rec = list_head(&dd->dd_prop_cbs);
 	while (rec != NULL) {
 		if (rec->cbr_ds == ds &&
 		    rec->cbr_func == callback &&
 		    rec->cbr_arg == cbarg &&
 		    strcmp(rec->cbr_propname, propname) == 0) {
 			/* Unlink the match while the list lock is held. */
 			list_remove(&dd->dd_prop_cbs, rec);
 			break;
 		}
 		rec = list_next(&dd->dd_prop_cbs, rec);
 	}
 	mutex_exit(&dd->dd_lock);
 
 	if (rec == NULL)
 		return (SET_ERROR(ENOMSG));
 
 	kmem_free((void*)rec->cbr_propname, strlen(rec->cbr_propname)+1);
 	kmem_free(rec, sizeof (dsl_prop_cb_record_t));
 
 	return (0);
 }
 
 /*
  * Return B_TRUE if at least one property callback is registered for ds.
  */
 boolean_t
 dsl_prop_hascb(dsl_dataset_t *ds)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_prop_cb_record_t *rec;
 
 	mutex_enter(&dd->dd_lock);
 	/* Stop at the first record that belongs to this dataset. */
 	rec = list_head(&dd->dd_prop_cbs);
 	while (rec != NULL && rec->cbr_ds != ds)
 		rec = list_next(&dd->dd_prop_cbs, rec);
 	mutex_exit(&dd->dd_lock);
 
 	return (rec != NULL ? B_TRUE : B_FALSE);
 }
 
 /*
  * Per-dataset worker for dsl_prop_notify_all(): re-read the value for every
  * callback record on the dataset's dsl_dir and hand it to the callback.
  * Records whose value cannot be read are skipped.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_prop_cb_record_t *rec;
 
 	mutex_enter(&dd->dd_lock);
 	rec = list_head(&dd->dd_prop_cbs);
 	while (rec != NULL) {
 		uint64_t curval;
 		int error;
 
 		error = dsl_prop_get_ds(rec->cbr_ds, rec->cbr_propname,
 		    sizeof (curval), 1, &curval, NULL);
 		if (error == 0)
 			rec->cbr_func(rec->cbr_arg, curval);
 
 		rec = list_next(&dd->dd_prop_cbs, rec);
 	}
 	mutex_exit(&dd->dd_lock);
 
 	return (0);
 }
 
 /*
  * Re-deliver all property values to the callbacks registered on dd and its
  * descendants.  This is used when renaming the dir.  The pool config lock
  * must be held as writer.
  */
 void
 dsl_prop_notify_all(dsl_dir_t *dd)
 {
 	dsl_pool_t *pool = dd->dd_pool;
 
 	ASSERT(RRW_WRITE_HELD(&pool->dp_config_rwlock));
 
 	(void) dmu_objset_find_dp(pool, dd->dd_object,
 	    dsl_prop_notify_all_cb, NULL, DS_FIND_CHILDREN);
 }
 
 /*
  * Deliver the new value of a property to the callbacks registered on
  * directory ddobj, then recurse into every child directory.  Unless "first"
  * is set (the directory on which the change was made), the recursion stops
  * at any directory that has its own entry for propname, because the change
  * is not inherited at or below that point.  The pool config lock must be
  * held as writer.  If the directory cannot be held, nothing is done.
  */
 static void
 dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
     const char *propname, uint64_t value, int first)
 {
 	dsl_dir_t *dd;
 	dsl_prop_cb_record_t *cbr;
 	objset_t *mos = dp->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	int err;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
 	if (err)
 		return;
 
 	if (!first) {
 		/*
 		 * If the prop is set here, then this change is not
 		 * being inherited here or below; stop the recursion.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname);
 		if (err == 0) {
 			dsl_dir_rele(dd, FTAG);
 			return;
 		}
 		ASSERT3U(err, ==, ENOENT);
 	}
 
 	mutex_enter(&dd->dd_lock);
 	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
 	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+		uint64_t propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
 
 		if (strcmp(cbr->cbr_propname, propname) != 0)
 			continue;
 
 		/*
 		 * If the property is set on this ds, then it is not
 		 * inherited here; don't call the callback.
 		 */
 		if (propobj && 0 == zap_contains(mos, propobj, propname))
 			continue;
 
 		cbr->cbr_func(cbr->cbr_arg, value);
 	}
 	mutex_exit(&dd->dd_lock);
 
 	/* Recurse into each child directory listed in the child-dir ZAP. */
 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	for (zap_cursor_init(&zc, mos,
-	    dd->dd_phys->dd_child_dir_zapobj);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	    zap_cursor_retrieve(&zc, za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_prop_changed_notify(dp, za->za_first_integer,
 		    propname, value, FALSE);
 	}
 	kmem_free(za, sizeof (zap_attribute_t));
 	zap_cursor_fini(&zc);
 	dsl_dir_rele(dd, FTAG);
 }
 
 /*
  * Apply one property change to the props ZAP of ds: the dataset's own props
  * object for a snapshot (created here on demand), the dsl_dir's props ZAP
  * otherwise.  "source" selects which of the propname, propname$inherit and
  * propname$recvd entries are updated or removed; see the per-case comments
  * below.  Afterwards registered callbacks are notified (integer properties
  * only) and the change is written to the pool history log.
  */
 void
 dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
     zprop_source_t source, int intsz, int numints, const void *value,
     dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t zapobj, intval, dummy;
 	int isint;
 	char valbuf[32];
 	const char *valstr = NULL;
 	char *inheritstr;
 	char *recvdstr;
 	char *tbuf = NULL;
 	int err;
 	uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
 
 	/* The property counts as an integer if it has a numeric default. */
 	isint = (dodefault(propname, 8, 1, &intval) == 0);
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		ASSERT(version >= SPA_VERSION_SNAP_PROPS);
-		if (ds->ds_phys->ds_props_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
 			dmu_buf_will_dirty(ds->ds_dbuf, tx);
-			ds->ds_phys->ds_props_obj =
+			dsl_dataset_phys(ds)->ds_props_obj =
 			    zap_create(mos,
 			    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 		}
-		zapobj = ds->ds_phys->ds_props_obj;
+		zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 	} else {
-		zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+		zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
 	}
 
 	/* Older pools only know "none" and "local" as sources. */
 	if (version < SPA_VERSION_RECVD_PROPS) {
 		if (source & ZPROP_SRC_NONE)
 			source = ZPROP_SRC_NONE;
 		else if (source & ZPROP_SRC_RECEIVED)
 			source = ZPROP_SRC_LOCAL;
 	}
 
 	inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
 	recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
 	switch (source) {
 	case ZPROP_SRC_NONE:
 		/*
 		 * revert to received value, if any (inherit -S)
 		 * - remove propname
 		 * - remove propname$inherit
 		 */
 		err = zap_remove(mos, zapobj, propname, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		err = zap_remove(mos, zapobj, inheritstr, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		break;
 	case ZPROP_SRC_LOCAL:
 		/*
 		 * remove propname$inherit
 		 * set propname -> value
 		 */
 		err = zap_remove(mos, zapobj, inheritstr, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		VERIFY0(zap_update(mos, zapobj, propname,
 		    intsz, numints, value, tx));
 		break;
 	case ZPROP_SRC_INHERITED:
 		/*
 		 * explicitly inherit
 		 * - remove propname
 		 * - set propname$inherit
 		 */
 		err = zap_remove(mos, zapobj, propname, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		if (version >= SPA_VERSION_RECVD_PROPS &&
 		    dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
 			dummy = 0;
 			VERIFY0(zap_update(mos, zapobj, inheritstr,
 			    8, 1, &dummy, tx));
 		}
 		break;
 	case ZPROP_SRC_RECEIVED:
 		/*
 		 * set propname$recvd -> value
 		 */
 		err = zap_update(mos, zapobj, recvdstr,
 		    intsz, numints, value, tx);
 		ASSERT(err == 0);
 		break;
 	case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
 		/*
 		 * clear local and received settings
 		 * - remove propname
 		 * - remove propname$inherit
 		 * - remove propname$recvd
 		 */
 		err = zap_remove(mos, zapobj, propname, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		err = zap_remove(mos, zapobj, inheritstr, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		/* FALLTHRU */
 	case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
 		/*
 		 * remove propname$recvd
 		 */
 		err = zap_remove(mos, zapobj, recvdstr, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		break;
 	default:
 		cmn_err(CE_PANIC, "unexpected property source: %d", source);
 	}
 
 	strfree(inheritstr);
 	strfree(recvdstr);
 
 	if (isint) {
 		/* Re-read the now-effective value for callbacks and logging. */
 		VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
 
 		if (dsl_dataset_is_snapshot(ds)) {
 			dsl_prop_cb_record_t *cbr;
 			/*
 			 * It's a snapshot; nothing can inherit this
 			 * property, so just look for callbacks on this
 			 * ds here.
 			 */
 			mutex_enter(&ds->ds_dir->dd_lock);
 			for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
 			    cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
 				if (cbr->cbr_ds == ds &&
 				    strcmp(cbr->cbr_propname, propname) == 0)
 					cbr->cbr_func(cbr->cbr_arg, intval);
 			}
 			mutex_exit(&ds->ds_dir->dd_lock);
 		} else {
 			dsl_prop_changed_notify(ds->ds_dir->dd_pool,
 			    ds->ds_dir->dd_object, propname, intval, TRUE);
 		}
 
 		(void) snprintf(valbuf, sizeof (valbuf),
 		    "%lld", (longlong_t)intval);
 		valstr = valbuf;
 	} else {
 		if (source == ZPROP_SRC_LOCAL) {
 			valstr = value;
 		} else {
 			/* Fetch the effective string value for the log. */
 			tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
 			if (dsl_prop_get_ds(ds, propname, 1,
 			    ZAP_MAXVALUELEN, tbuf, NULL) == 0)
 				valstr = tbuf;
 		}
 	}
 
 	spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
 	    source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
 	    "%s=%s", propname, (valstr == NULL ? "" : valstr));
 
 	if (tbuf != NULL)
 		kmem_free(tbuf, ZAP_MAXVALUELEN);
 }
 
 /*
  * Set a single uint64 property on the named dataset by wrapping it in a
  * one-element nvlist and passing that to dsl_props_set().
  */
 int
 dsl_prop_set_int(const char *dsname, const char *propname,
     zprop_source_t source, uint64_t value)
 {
 	nvlist_t *props;
 	int rv;
 
 	props = fnvlist_alloc();
 	fnvlist_add_uint64(props, propname, value);
 
 	rv = dsl_props_set(dsname, source, props);
 
 	fnvlist_free(props);
 	return (rv);
 }
 
 /*
  * Set a single string property on the named dataset by wrapping it in a
  * one-element nvlist and passing that to dsl_props_set().
  */
 int
 dsl_prop_set_string(const char *dsname, const char *propname,
     zprop_source_t source, const char *value)
 {
 	nvlist_t *props;
 	int rv;
 
 	props = fnvlist_alloc();
 	fnvlist_add_string(props, propname, value);
 
 	rv = dsl_props_set(dsname, source, props);
 
 	fnvlist_free(props);
 	return (rv);
 }
 
 /*
  * Request inheritance of a single property on the named dataset.  The
  * property is passed to dsl_props_set() as a valueless boolean nvpair.
  */
 int
 dsl_prop_inherit(const char *dsname, const char *propname,
     zprop_source_t source)
 {
 	nvlist_t *props;
 	int rv;
 
 	props = fnvlist_alloc();
 	fnvlist_add_boolean(props, propname);
 
 	rv = dsl_props_set(dsname, source, props);
 
 	fnvlist_free(props);
 	return (rv);
 }
 
 /* Arguments handed by dsl_props_set() to its check and sync functions. */
 typedef struct dsl_props_set_arg {
 	const char *dpsa_dsname;	/* name of the dataset to modify */
 	zprop_source_t dpsa_source;	/* source to apply the props with */
 	nvlist_t *dpsa_props;		/* properties to set */
 } dsl_props_set_arg_t;
 
 /*
  * Check function for dsl_props_set(): validate the request before anything
  * is modified.
  *
  * Returns 0 if every property can be set, otherwise:
  *	the dsl_dataset_hold() error if the dataset cannot be held,
  *	ENAMETOOLONG if a property name does not fit in ZAP_MAXNAMELEN,
  *	E2BIG if a string value exceeds the pool version's value limit,
  *	ENOTSUP if ds is a snapshot and the pool predates snapshot props.
  */
 static int
 dsl_props_set_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_props_set_arg_t *dpsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t version;
 	nvpair_t *elem = NULL;
 	int err;
 
 	err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	version = spa_version(ds->ds_dir->dd_pool->dp_spa);
 	while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
 		if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
 			err = SET_ERROR(ENAMETOOLONG);
 			goto out;
 		}
 		if (nvpair_type(elem) == DATA_TYPE_STRING) {
 			char *valstr = fnvpair_value_string(elem);
 			if (strlen(valstr) >= (version <
 			    SPA_VERSION_STMF_PROP ?
 			    ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
 				/* Wrapped like every other error here. */
 				err = SET_ERROR(E2BIG);
 				goto out;
 			}
 		}
 	}
 
 	if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS)
 		err = SET_ERROR(ENOTSUP);
 
 out:
 	/* Single exit so the dataset hold is always dropped. */
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Apply every property in "props" to ds with the given source, one
  * dsl_prop_set_sync_impl() call per nvpair.  String, uint64 and (valueless)
  * boolean pairs are accepted; a nested nvlist is unwrapped to its
  * ZPROP_VALUE member first.  Any other pair type panics.
  */
 void
 dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
     nvlist_t *props, dmu_tx_t *tx)
 {
 	nvpair_t *elem;
 
 	for (elem = nvlist_next_nvpair(props, NULL); elem != NULL;
 	    elem = nvlist_next_nvpair(props, elem)) {
 		nvpair_t *pair = elem;
 
 		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 			/*
 			 * dsl_prop_get_all_impl() returns properties in this
 			 * format.
 			 */
 			nvlist_t *attrs = fnvpair_value_nvlist(pair);
 			pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
 		}
 
 		switch (nvpair_type(pair)) {
 		case DATA_TYPE_STRING: {
 			const char *value = fnvpair_value_string(pair);
 
 			/* The length passed includes the terminating NUL. */
 			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
 			    source, 1, strlen(value) + 1, value, tx);
 			break;
 		}
 		case DATA_TYPE_UINT64: {
 			uint64_t intval = fnvpair_value_uint64(pair);
 
 			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
 			    source, sizeof (intval), 1, &intval, tx);
 			break;
 		}
 		case DATA_TYPE_BOOLEAN:
 			/* No value to store. */
 			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
 			    source, 0, 0, NULL, tx);
 			break;
 		default:
 			panic("invalid nvpair type");
 		}
 	}
 }
 
 /*
  * Sync function for dsl_props_set(): hold the dataset named in the argument
  * block, apply the properties, and release the hold.  Failure to hold the
  * dataset is fatal (VERIFY0).
  */
 static void
 dsl_props_set_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_props_set_arg_t *args = arg;
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), args->dpsa_dsname,
 	    FTAG, &ds));
 	dsl_props_set_sync_impl(ds, args->dpsa_source, args->dpsa_props, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Set a list of properties on the named dataset through a sync task.
  * All-or-nothing; if any prop can't be set, nothing will be modified.
  */
 int
 dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
 {
 	dsl_props_set_arg_t dpsa;
 	int nblks;
 
 	dpsa.dpsa_dsname = dsname;
 	dpsa.dpsa_source = source;
 	dpsa.dpsa_props = props;
 
 	/*
 	 * If the source includes NONE, then we will only be removing entries
 	 * from the ZAP object.  In that case don't check for ENOSPC.
 	 */
 	if (source & ZPROP_SRC_NONE)
 		nblks = 0;
 	else
 		nblks = 2 * fnvlist_num_pairs(props);
 
 	return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
 	    &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Bit flags that steer dsl_prop_get_all_ds() and dsl_prop_get_all_impl().
  * They are combined with bitwise OR.
  */
 typedef enum dsl_prop_getflags {
 	DSL_PROP_GET_INHERITING = 0x1,	/* searching parent of target ds */
 	DSL_PROP_GET_SNAPSHOT = 0x2,	/* snapshot dataset */
 	DSL_PROP_GET_LOCAL = 0x4,	/* local properties */
 	DSL_PROP_GET_RECEIVED = 0x8	/* received properties */
 } dsl_prop_getflags_t;
 
 /*
  * Add the properties stored in the ZAP object propobj to nv, skipping any
  * name that nv already contains.  Each property becomes a nested nvlist
  * holding ZPROP_VALUE and ZPROP_SOURCE.
  *
  *	mos		objset containing propobj
  *	propobj		props ZAP object to iterate
  *	setpoint	source name recorded for local and inherited values
  *	flags		DSL_PROP_GET_* bits selecting which entries to report
  *	nv		nvlist that receives the properties
  *
  * Returns 0 on success, or the error of the failing ZAP operation.
  */
 static int
 dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
     const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err = 0;
 
 	for (zap_cursor_init(&zc, mos, propobj);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		nvlist_t *propval;
 		zfs_prop_t prop;
 		char buf[ZAP_MAXNAMELEN];
 		char *valstr;
 		const char *suffix;
 		const char *propname;
 		const char *source;
 
 		suffix = strchr(za.za_name, '$');
 
 		if (suffix == NULL) {
 			/*
 			 * Skip local properties if we only want received
 			 * properties.
 			 */
 			if (flags & DSL_PROP_GET_RECEIVED)
 				continue;
 
 			propname = za.za_name;
 			source = setpoint;
 		} else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
 			/* Skip explicitly inherited entries. */
 			continue;
 		} else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
 			if (flags & DSL_PROP_GET_LOCAL)
 				continue;
 
 			/* Strip the suffix to get the bare property name. */
 			(void) strncpy(buf, za.za_name, (suffix - za.za_name));
 			buf[suffix - za.za_name] = '\0';
 			propname = buf;
 
 			if (!(flags & DSL_PROP_GET_RECEIVED)) {
 				/* Skip if locally overridden. */
 				err = zap_contains(mos, propobj, propname);
 				if (err == 0)
 					continue;
 				if (err != ENOENT)
 					break;
 
 				/* Skip if explicitly inherited. */
 				valstr = kmem_asprintf("%s%s", propname,
 				    ZPROP_INHERIT_SUFFIX);
 				err = zap_contains(mos, propobj, valstr);
 				strfree(valstr);
 				if (err == 0)
 					continue;
 				if (err != ENOENT)
 					break;
 			}
 
 			source = ((flags & DSL_PROP_GET_INHERITING) ?
 			    setpoint : ZPROP_SOURCE_VAL_RECVD);
 		} else {
 			/*
 			 * For backward compatibility, skip suffixes we don't
 			 * recognize.
 			 */
 			continue;
 		}
 
 		prop = zfs_name_to_prop(propname);
 
 		/* Skip non-inheritable properties. */
 		if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
 		    !zfs_prop_inheritable(prop))
 			continue;
 
 		/* Skip properties not valid for this type. */
 		if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
 		    !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
 			continue;
 
 		/* Skip properties already defined. */
 		if (nvlist_exists(nv, propname))
 			continue;
 
 		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		if (za.za_integer_length == 1) {
 			/*
 			 * String property
 			 */
 			char *tmp = kmem_alloc(za.za_num_integers,
 			    KM_SLEEP);
 			err = zap_lookup(mos, propobj,
 			    za.za_name, 1, za.za_num_integers, tmp);
 			if (err != 0) {
 				/*
 				 * Bail out, releasing the entry allocated
 				 * above so that it is not leaked.
 				 */
 				kmem_free(tmp, za.za_num_integers);
 				nvlist_free(propval);
 				break;
 			}
 			VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
 			    tmp) == 0);
 			kmem_free(tmp, za.za_num_integers);
 		} else {
 			/*
 			 * Integer property
 			 */
 			ASSERT(za.za_integer_length == 8);
 			(void) nvlist_add_uint64(propval, ZPROP_VALUE,
 			    za.za_first_integer);
 		}
 
 		VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
 		VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
 		nvlist_free(propval);
 	}
 	zap_cursor_fini(&zc);
 	/* ENOENT from the cursor just means the end of the ZAP was reached. */
 	if (err == ENOENT)
 		err = 0;
 	return (err);
 }
 
 /*
  * Iterate over all properties for this dataset and return them in an nvlist.
  *
  * The nvlist is allocated here and stored in *nvp.  Properties are gathered
  * first from the dataset's own props object (if it has one) and then from
  * its dsl_dir and each ancestor in turn.  With DSL_PROP_GET_LOCAL or
  * DSL_PROP_GET_RECEIVED set, the walk does not continue into directories
  * that would only contribute inherited values.  The pool config lock must
  * be held.  Returns 0 or the first error from dsl_prop_get_all_impl().
  */
 static int
 dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
     dsl_prop_getflags_t flags)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	int err = 0;
 	char setpoint[MAXNAMELEN];
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	if (dsl_dataset_is_snapshot(ds))
 		flags |= DSL_PROP_GET_SNAPSHOT;
 
 	ASSERT(dsl_pool_config_held(dp));
 
-	if (ds->ds_phys->ds_props_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
 		ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
 		dsl_dataset_name(ds, setpoint);
-		err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
 		if (err)
 			goto out;
 	}
 
 	for (; dd != NULL; dd = dd->dd_parent) {
 		/* Anything above the dataset's own level is inherited. */
 		if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
 			if (flags & (DSL_PROP_GET_LOCAL |
 			    DSL_PROP_GET_RECEIVED))
 				break;
 			flags |= DSL_PROP_GET_INHERITING;
 		}
 		dsl_dir_name(dd, setpoint);
-		err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
 		if (err)
 			break;
 	}
 out:
 	return (err);
 }
 
 boolean_t
 dsl_prop_get_hasrecvd(const char *dsname)
 {
 	uint64_t dummy;
 
 	return (0 ==
 	    dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
 }
 
 /*
  * Set ZPROP_HAS_RECVD to 0 on the named dataset with the given source.
  * On pools older than SPA_VERSION_RECVD_PROPS this does nothing and
  * returns 0.  Failure to open the pool is fatal (VERIFY0).
  */
 static int
 dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
 {
 	spa_t *spa;
 	uint64_t version;
 
 	/* The spa is only opened long enough to read the pool version. */
 	VERIFY0(spa_open(dsname, &spa, FTAG));
 	version = spa_version(spa);
 	spa_close(spa, FTAG);
 
 	if (version < SPA_VERSION_RECVD_PROPS)
 		return (0);
 
 	return (dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0));
 }
 
 /*
  * Mark the dataset as having received properties, unless it is already
  * marked.  Call after successfully receiving properties to ensure that only
  * the first receive on or after SPA_VERSION_RECVD_PROPS blows away local
  * properties.
  */
 int
 dsl_prop_set_hasrecvd(const char *dsname)
 {
 	if (dsl_prop_get_hasrecvd(dsname))
 		return (0);
 
 	return (dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL));
 }
 
 /*
  * Clear the ZPROP_HAS_RECVD marker on the named dataset by setting it with
  * source ZPROP_SRC_NONE.  Any failure is fatal (VERIFY0).
  */
 void
 dsl_prop_unset_hasrecvd(const char *dsname)
 {
 	VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
 }
 
 /*
  * Return all properties of the objset's dataset in a newly allocated
  * nvlist stored in *nvp.  No filtering flags are applied.
  */
 int
 dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	return (dsl_prop_get_all_ds(ds, nvp, 0));
 }
 
 /*
  * Return the received properties of the named dataset in a newly allocated
  * nvlist stored in *nvp.  Returns the hold error if the objset cannot be
  * held, otherwise the result of dsl_prop_get_all_ds().
  */
 int
 dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
 {
 	objset_t *os;
 	dsl_prop_getflags_t flags;
 	int error;
 
 	/*
 	 * Received properties are not distinguishable from local properties
 	 * until the dataset has received properties on or after
 	 * SPA_VERSION_RECVD_PROPS.
 	 */
 	if (dsl_prop_get_hasrecvd(dsname))
 		flags = DSL_PROP_GET_RECEIVED;
 	else
 		flags = DSL_PROP_GET_LOCAL;
 
 	error = dmu_objset_hold(dsname, FTAG, &os);
 	if (error == 0) {
 		error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
 		dmu_objset_rele(os, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Store a uint64 property value in nv.  If nv already has an nvlist entry
  * for the property, the value is added to that entry under ZPROP_VALUE.
  * Otherwise a new entry is created; when the value equals the property's
  * default, the new entry also gets an empty ZPROP_SOURCE string.
  */
 void
 dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
 {
 	const char *propname = zfs_prop_to_name(prop);
 	nvlist_t *entry;
 	uint64_t defval;
 
 	if (nvlist_lookup_nvlist(nv, propname, &entry) != 0) {
 		VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, value) == 0);
 		/* Indicate the default source if we can. */
 		if (dodefault(propname, 8, 1, &defval) == 0 &&
 		    value == defval) {
 			VERIFY(nvlist_add_string(entry, ZPROP_SOURCE,
 			    "") == 0);
 		}
 		VERIFY(nvlist_add_nvlist(nv, propname, entry) == 0);
 		nvlist_free(entry);
 	} else {
 		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, value) == 0);
 	}
 }
 
 /*
  * Store a string property value in nv.  If nv already has an nvlist entry
  * for the property, the value is added to that entry under ZPROP_VALUE;
  * otherwise a new entry holding only ZPROP_VALUE is created.
  */
 void
 dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
 {
 	const char *propname = zfs_prop_to_name(prop);
 	nvlist_t *entry;
 
 	if (nvlist_lookup_nvlist(nv, propname, &entry) != 0) {
 		VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, value) == 0);
 		VERIFY(nvlist_add_nvlist(nv, propname, entry) == 0);
 		nvlist_free(entry);
 	} else {
 		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, value) == 0);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	(revision 275782)
@@ -1,1831 +1,1835 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /*
  * Signature of a per-block scan callback; the callbacks themselves are
  * looked up through scan_funcs[] below.
  */
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 /* Forward declarations of file-local functions used before definition. */
 static scan_cb_t dsl_scan_scrub_cb;
 static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
 /* Scan tunables; each one is exported through a sysctl below. */
 unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
 unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
 unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
 unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */
 
 unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
 						 per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 
 /* Register the tunables above under the _vfs_zfs sysctl node. */
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
     &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
     &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
     &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
     &zfs_scan_idle, 0, "Idle scan window in clock ticks");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
     &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
     &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
     &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
 /*
  * NOTE(review): the next two variables are declared boolean_t but are
  * exported with SYSCTL_INT -- confirm boolean_t is int-sized here.
  */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
     &zfs_no_scrub_io, 0, "Disable scrub I/O");
 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
     &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
 
 /*
  * Value copied into scn_ddt_class_max by dsl_scan_setup_sync() when a
  * scrub or resilver is started.
  */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 /* max number of blocks to free in a single TXG */
 uint64_t zfs_free_max_blocks = UINT64_MAX;
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
     &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
 
 
 /* True when the scan's persisted function is a scrub or a resilver. */
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
 /* Defined elsewhere; used by dsl_scan_check_pause() as the time cap. */
 extern int zfs_txg_timeout;
 
 /*
  * Callback table indexed by scn_phys.scn_func.
  * The order has to match pool_scan_type; slot 0 has no callback.
  */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
 /*
  * Allocate the in-core dsl_scan_t for pool 'dp' and load any persisted
  * scan state from the pool directory object.
  *
  * 'txg' is recorded as scn_restart_txg whenever the scan has to be
  * restarted from the beginning (old-style scrub found, or a new-style
  * scan that was touched by software older than SPA_VERSION_SCAN).
  *
  * Returns 0 on success, including the case where no DMU_POOL_SCAN entry
  * exists (ENOENT); any other zap_lookup() error from that lookup is
  * returned as-is.  Note that the ENOENT and error returns happen before
  * spa_scan_stat_init() is called.
  */
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
 	int err;
 	dsl_scan_t *scn;
 	spa_t *spa = dp->dp_spa;
 	uint64_t f;
 
 	/* Zero-filled allocation, published through dp->dp_scan. */
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
 	/*
 	 * It's possible that we're resuming a scan after a reboot so
 	 * make sure that the scan_async_destroying flag is initialized
 	 * appropriately.
 	 */
 	ASSERT(!scn->scn_async_destroying);
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
 	/* Presence of the "scrub_func" key marks an old-style scrub. */
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
 		/*
 		 * There was an old-style scrub in progress.  Restart a
 		 * new-style scrub from the beginning.
 		 */
 		scn->scn_restart_txg = txg;
 		zfs_dbgmsg("old-style scrub was in progress; "
 		    "restarting new-style scrub in txg %llu",
 		    scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
 		 * can be freed by dsl_scan_done().
 		 */
 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
 		/* No old-style scrub: read the whole scn_phys record. */
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
 		if (err == ENOENT)
 			return (0);
 		else if (err)
 			return (err);
 
 		if (scn->scn_phys.scn_state == DSS_SCANNING &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
 			 * pool, and the pool was accessed by old
 			 * software.  Restart from the beginning, since
 			 * the old software may have changed the pool in
 			 * the meantime.
 			 */
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub was modified "
 			    "by old software; restarting in txg %llu",
 			    scn->scn_restart_txg);
 		}
 	}
 
 	spa_scan_stat_init(spa);
 	return (0);
 }
 
 /*
  * Free the dsl_scan_t attached to 'dp', if there is one, and clear
  * dp->dp_scan so the pointer cannot be reused.
  */
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
 	if (dp->dp_scan) {
 		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
 		dp->dp_scan = NULL;
 	}
 }
 
 /*
  * Pre-check for starting a scan: fails with EBUSY when the pool's scan
  * state is already DSS_SCANNING, otherwise returns 0.  'arg' is unused.
  * NOTE(review): presumably the check half of a sync task paired with
  * dsl_scan_setup_sync(); the caller is not visible here.
  */
 /* ARGSUSED */
 static int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (scn->scn_phys.scn_state == DSS_SCANNING)
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Start a new scan in the transaction 'tx'.
  *
  * 'arg' points to the pool_scan_func_t to run.  The persisted scan
  * record (scn_phys) is zeroed and re-initialised, the in-core restart
  * and done txgs are cleared, a fresh queue ZAP object is created, the
  * state is written out with dsl_scan_sync_state(), and the start is
  * recorded in the pool history.
  */
 static void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	pool_scan_func_t *funcp = arg;
 	dmu_object_type_t ot = 0;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
 	scn->scn_phys.scn_func = *funcp;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = 0;
 	scn->scn_phys.scn_max_txg = tx->tx_txg;
 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 	scn->scn_restart_txg = 0;
 	scn->scn_done_txg = 0;
 	spa_scan_stat_init(spa);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		/* Scrub/resilver overrides the "entire DDT" default above. */
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 
 		/* rewrite all disk labels */
 		vdev_config_dirty(spa->spa_root_vdev);
 
 		/*
 		 * vdev_resilver_needed() is handed pointers to the min/max
 		 * txg fields; its result selects which start event is sent.
 		 * NOTE(review): presumably it also narrows those txgs when
 		 * a resilver is needed -- confirm against its definition.
 		 */
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 		} else {
 			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
 		/*
 		 * If this is an incremental scrub, limit the DDT scrub phase
 		 * to just the auto-ditto class (for correctness); the rest
 		 * of the scrub should go faster using top-down pruning.
 		 */
 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
 	}
 
 	/* back to the generic stuff */
 
 	/* Allocate the block statistics on first use; always reset them. */
 	if (dp->dp_blkstats == NULL) {
 		dp->dp_blkstats =
 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 	}
 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 
 	/* Pools older than SPA_VERSION_DSL_SCRUB get a generic ZAP type. */
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
 	dsl_scan_sync_state(scn, tx);
 
 	spa_history_log_internal(spa, "scan setup", tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
 }
 
 /*
  * Tear down the current scan in transaction 'tx'.
  *
  * Old-style scrub keys are always removed from the pool directory and
  * the queue object, if any, is freed.  If the scan was in the
  * DSS_SCANNING state it is then moved to DSS_FINISHED ('complete' true)
  * or DSS_CANCELED ('complete' false), the event is logged, and for
  * scrub/resilver the in-flight i/o is drained, DTLs are reassessed, the
  * error log is rotated and SPA_ASYNC_RESILVER_DONE is requested.
  *
  * This function does not write scn_phys back to disk itself.
  */
 /* ARGSUSED */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	/* NULL-terminated list of pool-directory keys used by old scrubs. */
 	static const char *old_names[] = {
 		"scrub_bookmark",
 		"scrub_ddt_bookmark",
 		"scrub_ddt_class_max",
 		"scrub_queue",
 		"scrub_min_txg",
 		"scrub_max_txg",
 		"scrub_func",
 		"scrub_errors",
 		NULL
 	};
 
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int i;
 
 	/* Remove any remnants of an old-style scrub. */
 	for (i = 0; old_names[i]; i++) {
 		/* Errors (e.g. key not present) are deliberately ignored. */
 		(void) zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 	}
 
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
 
 	/*
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (complete)
 		scn->scn_phys.scn_state = DSS_FINISHED;
 	else
 		scn->scn_phys.scn_state = DSS_CANCELED;
 
 	spa_history_log_internal(spa, "scan done", tx,
 	    "complete=%u", complete);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		/* Wait until every in-flight scrub i/o has completed. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > 0) {
 			cv_wait(&spa->spa_scrub_io_cv,
 			    &spa->spa_scrub_lock);
 		}
 		mutex_exit(&spa->spa_scrub_lock);
 		spa->spa_scrub_started = B_FALSE;
 		spa->spa_scrub_active = B_FALSE;
 
 		/*
 		 * If the scrub/resilver completed, update all DTLs to
 		 * reflect this.  Whether it succeeded or not, vacate
 		 * all temporary scrub DTLs.
 		 */
 		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
 		if (complete) {
 			/* A nonzero scn_min_txg is reported as a resilver. */
 			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
 			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
 		}
 		spa_errlog_rotate(spa);
 
 		/*
 		 * We may have finished replacing a device.
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
 }
 
 /*
  * Check function for dsl_scan_cancel(): there is nothing to cancel
  * unless a scan is in the DSS_SCANNING state, in which case 0 is
  * returned; otherwise ENOENT.  'arg' is unused.
  */
 /* ARGSUSED */
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return (SET_ERROR(ENOENT));
 	return (0);
 }
 
 /*
  * Sync function for dsl_scan_cancel(): finish the scan as not complete
  * and then write the updated scan state out.  'arg' is unused.
  */
 /* ARGSUSED */
 static void
 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	dsl_scan_done(scn, B_FALSE, tx);
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * Cancel the scan running on pool 'dp' by dispatching a sync task made
  * of dsl_scan_cancel_check() and dsl_scan_cancel_sync().  Returns the
  * result of dsl_sync_task().
  * NOTE(review): the literal 3 is presumably the estimate of blocks
  * modified by the task -- confirm against dsl_sync_task().
  */
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
 	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Forward declarations: dsl_scan_recurse() calls both of these before
  * their definitions appear.
  */
 static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx);
 static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
     dmu_objset_type_t ostype,
     dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Free block 'bp' in 'txg' on the pool's spa; a thin wrapper around
  * zio_free().
  */
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
 	zio_free(dp->dp_spa, txg, bp);
 }
 
 /*
  * Issue a free of block 'bpp' in 'txg' as a child of 'pio' without
  * waiting for it.  The child inherits pio's io_flags.  Asserts that the
  * caller is in the pool's sync context.
  */
 void
 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	ASSERT(dsl_pool_sync_context(dp));
 	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
 	    pio->io_flags));
 }
 
 /*
  * Return the highest txg the scan should consider for dataset 'ds':
  * the scan's scn_max_txg, capped at the dataset's creation txg when
  * 'ds' is a snapshot.
  */
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 	if (dsl_dataset_is_snapshot(ds))
-		return (MIN(smt, ds->ds_phys->ds_creation_txg));
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
 	return (smt);
 }
 
 /*
  * Persist the scan record: write scn_phys, as SCAN_PHYS_NUMINTS uint64
  * values, to the DMU_POOL_SCAN key of the pool directory object in
  * transaction 'tx'.  A failed update trips VERIFY0().
  */
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 	    &scn->scn_phys, tx));
 }
 
 /* Defined elsewhere; dirty-data percentage used as a pause threshold. */
 extern int zfs_vdev_async_write_active_min_dirty_percent;
 
 /*
  * Decide whether the scan should stop for this txg.
  *
  * 'zb' is the bookmark of the block about to be visited, or NULL.
  * Returns B_TRUE if the scan is (now) pausing.  On the transition to
  * pausing, scn_pausing is set and, when 'zb' is non-NULL, it is saved
  * in scn_phys.scn_bookmark as the resume point.
  */
 static boolean_t
 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/* we never skip user/group accounting objects */
 	if (zb && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	if (scn->scn_pausing)
 		return (B_TRUE); /* we're already pausing */
 
 	/* A nonzero saved bookmark means the resume point is not reached. */
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 blocks. */
 	if (zb && zb->zb_level != 0)
 		return (B_FALSE);
 
 	/*
 	 * We pause if:
 	 *  - we have scanned for the maximum time: an entire txg
 	 *    timeout (default 5 sec)
 	 *  or
 	 *  - we have scanned for at least the minimum time (default 1 sec
 	 *    for scrub, 3 sec for resilver), and either we have sufficient
 	 *    dirty data that we are starting to write more quickly
 	 *    (default 30%), or someone is explicitly waiting for this txg
 	 *    to complete.
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 */
 	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
 	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
 	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
 	    (txg_sync_waiting(scn->scn_dp) ||
 	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
 			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			scn->scn_phys.scn_bookmark = *zb;
 		}
 		/* The DDT bookmark is logged whether or not zb was given. */
 		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 		scn->scn_pausing = B_TRUE;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Argument bundle handed to the zil_parse() callbacks by
  * dsl_scan_zil().
  */
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;	/* pool being scanned */
 	zil_header_t	*zsa_zh;	/* header of the ZIL being parsed */
 } zil_scan_arg_t;
 
 /*
  * zil_parse() block callback: run the current scan function on the log
  * block 'bp' unless it is a hole, was born at or before
  * scn_cur_min_txg, or is an unclaimed block born at or after the
  * spa's first txg.  'arg' is a zil_scan_arg_t.  Always returns 0;
  * 'zilog' is unused.
  */
 /* ARGSUSED */
 static int
 dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zil_scan_arg_t *zsa = arg;
 	dsl_pool_t *dp = zsa->zsa_dp;
 	dsl_scan_t *scn = dp->dp_scan;
 	zil_header_t *zh = zsa->zsa_zh;
 	zbookmark_phys_t zb;
 
 	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * One block ("stubby") can be allocated a long time ago; we
 	 * want to visit that one because it has been allocated
 	 * (on-disk) even if it hasn't been claimed (even though for
 	 * scrub there's nothing to do to it).
 	 */
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
 		return (0);
 
 	/* Bookmark fields are taken from the header/block checksum words. */
 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 /*
  * zil_parse() record callback: for TX_WRITE records, run the current
  * scan function on the block pointer embedded in the record, unless it
  * is a hole, was born at or before scn_cur_min_txg, the log is
  * unclaimed, or the block was born before claim_txg.  Records of any
  * other type are ignored.  'arg' is a zil_scan_arg_t.  Always returns
  * 0; 'zilog' is unused.
  */
 /* ARGSUSED */
 static int
 dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	if (lrc->lrc_txtype == TX_WRITE) {
 		zil_scan_arg_t *zsa = arg;
 		dsl_pool_t *dp = zsa->zsa_dp;
 		dsl_scan_t *scn = dp->dp_scan;
 		zil_header_t *zh = zsa->zsa_zh;
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
 		zbookmark_phys_t zb;
 
 		if (BP_IS_HOLE(bp) ||
 		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 			return (0);
 
 		/*
 		 * birth can be < claim_txg if this record's txg is
 		 * already txg sync'ed (but this log block contains
 		 * other records that are not synced)
 		 */
 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
 			return (0);
 
 		/* Block id is the write offset in units of the block size. */
 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
 		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	}
 	return (0);
 }
 
 /*
  * Scan the intent log described by header 'zh': parse it with
  * dsl_scan_zil_block() and dsl_scan_zil_record() as callbacks, using a
  * temporary zilog that is freed before returning.  The result of
  * zil_parse() is ignored.
  */
 static void
 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 {
 	uint64_t claim_txg = zh->zh_claim_txg;
 	zil_scan_arg_t zsa = { dp, zh };
 	zilog_t *zilog;
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
 	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
 		return;
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
 
 	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
 	    claim_txg);
 
 	zil_free(zilog);
 }
 
 /*
  * Issue a no-wait ARC prefetch read for block 'bp', parented to
  * scn_zio_root.  Nothing is done when prefetching is disabled via
  * zfs_no_scrub_prefetch, when 'bp' is a hole, when it was born at or
  * before scn_min_txg, or when it is a level-0 block that is not a dnode
  * block.  'objset', 'object' and 'blkid' only populate the bookmark
  * passed to arc_read(); 'buf' is unused.
  */
 /* ARGSUSED */
 static void
 dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
     uint64_t objset, uint64_t object, uint64_t blkid)
 {
 	zbookmark_phys_t czb;
 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
 
 	if (zfs_no_scrub_prefetch)
 		return;
 
 	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
 		return;
 
 	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
 
 	/* No done callback is supplied and the result is ignored. */
 	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 }
 
 /*
  * While resuming from a saved bookmark, decide whether the block at
  * 'zb' can be skipped.
  *
  * Returns B_TRUE when 'zb' lies before the saved bookmark (as judged by
  * zbookmark_is_before() with 'dnp'), so the caller can skip it.
  * Returns B_FALSE otherwise; in that case, if 'zb' equals the saved
  * bookmark or belongs to a higher-numbered object, the saved bookmark
  * is zeroed as a side effect.
  */
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	/*
 	 * We never skip over user/group accounting objects (obj<0)
 	 */
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 	    (int64_t)zb->zb_object >= 0) {
 		/*
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
 		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
 		 * If we found the block we're trying to resume from, or
 		 * we went past it to a different object, zero it out to
 		 * indicate that it's OK to start checking for pausing
 		 * again.
 		 */
 		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
 		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
 		}
 	}
 	return (B_FALSE);
 }
 
 /*
  * Read the block 'bp' and visit everything it points to.
  *
  * Three kinds of block are read through the ARC and descended into:
  * indirect blocks (level > 0), dnode blocks and objset blocks.  Any
  * other level-0 block is left alone and 0 is returned.
  *
  * Return nonzero on i/o error (scn_errors is incremented first),
  * 0 otherwise.
  * NOTE(review): this comment previously also said "Return new buf to
  * write out in *bufp", but the function has no bufp parameter; that
  * sentence looks stale.
  */
 static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: an array of epb child block pointers. */
 		uint32_t flags = ARC_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/* First pass: prefetch every child. */
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
 			    zb->zb_object, zb->zb_blkid * epb + i);
 		}
 		/* Second pass: visit every child, one level down. */
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
 			    ds, scn, ostype, tx);
 		}
 		(void) arc_buf_remove_ref(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Dnode block: an array of epb dnode_phys_t entries. */
 		uint32_t flags = ARC_WAIT;
 		dnode_phys_t *cdnp;
 		int i, j;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/*
 		 * First pass: prefetch each dnode's block pointers.  The
 		 * object number is the dnode's index (blkid * epb + i) and
 		 * the block id is the pointer's slot j.
 		 */
 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
 				dsl_scan_prefetch(scn, buf, cbp,
 				    zb->zb_objset, zb->zb_blkid * epb + i, j);
 			}
 		}
 		/* Second pass: visit each dnode in the block. */
 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
 
 		(void) arc_buf_remove_ref(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: visit the dnodes embedded in objset_phys_t. */
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 
 		osp = buf->b_data;
 
 		/* The objset's own os_type replaces the passed-in ostype. */
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
 		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			/*
 			 * We also always visit user/group accounting
 			 * objects, and never skip them, even if we are
 			 * pausing.  This is necessary so that the space
 			 * deltas from this txg get integrated.
 			 */
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_userused_dnode,
 			    DMU_USERUSED_OBJECT, tx);
 		}
 		(void) arc_buf_remove_ref(buf, &buf);
 	}
 
 	return (0);
 }
 
 /*
  * Visit every block pointer held directly in dnode 'dnp', which is
  * object number 'object' of dataset 'ds' (objset 0 is used in the
  * bookmark when 'ds' is NULL).  The dnode's spill block pointer is
  * visited as well when DNODE_FLAG_SPILL_BLKPTR is set.
  */
 static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp,
     uint64_t object, dmu_tx_t *tx)
 {
 	int j;
 
 	/* Top-level pointers sit at level dn_nlevels - 1, block id j. */
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    dnp->dn_nlevels - 1, j);
 		dsl_scan_visitbp(&dnp->dn_blkptr[j],
 		    &czb, dnp, ds, scn, ostype, tx);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		zbookmark_phys_t czb;
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    0, DMU_SPILL_BLKID);
 		dsl_scan_visitbp(&dnp->dn_spill,
 		    &czb, dnp, ds, scn, ostype, tx);
 	}
 }
 
 /*
  * Visit one block pointer: apply the pause and resume checks, descend
  * into the block via dsl_scan_recurse(), and finally hand the block to
  * the current scan callback unless the DDT phase already covers it or
  * it was physically born after scn_cur_max_txg.
  *
  * The arguments are in this order because mdb can only print the
  * first 5; we want them to be useful.
  */
 static void
 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	/* buf is never assigned in this function; it stays NULL. */
 	arc_buf_t *buf = NULL;
 	/* By-value copy of *bp; this copy is what gets recursed into. */
 	blkptr_t bp_toread = *bp;
 
 	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
 
 	if (dsl_scan_check_pause(scn, zb))
 		return;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 
 	/* Counted before the birth-txg filter below is applied. */
 	scn->scn_visited_this_txg++;
 
 	dprintf_bp(bp,
 	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
 	    ds, ds ? ds->ds_object : 0,
 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 	    bp);
 
 	/* Skip blocks born at or before the current minimum txg. */
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return;
 
 	/* A read error below this block ends the visit of this block. */
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
 		return;
 
 	/*
 	 * If dsl_scan_ddt() has already visited this block, it will have
 	 * already done any translations or scrubbing, so don't call the
 	 * callback again.
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		ASSERT(buf == NULL);
 		return;
 	}
 
 	/*
 	 * If this block is from the future (after cur_max_txg), then we
 	 * are doing this on behalf of a deleted snapshot, and we will
 	 * revisit the future block on the next pass of this dataset.
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
 	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
 		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 	}
 }
 
 /*
  * Start a traversal at the root block pointer 'bp' of dataset 'ds'.
  * When 'ds' is NULL the bookmark's objset is DMU_META_OBJSET.  The
  * visit starts with no dnode and objset type DMU_OST_NONE.
  */
 static void
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	zbookmark_phys_t zb;
 
 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	dsl_scan_visitbp(bp, &zb, NULL,
 	    ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
 /*
  * Fix up scan state when dataset 'ds' is being destroyed.  Does nothing
  * unless a scan is in the DSS_SCANNING state.
  *
  * Three cases are handled:
  *  - 'ds' is the dataset currently being traversed: a snapshot is
  *    replaced in the bookmark by its next snapshot and flagged for a
  *    revisit; a non-snapshot resets the bookmark to
  *    ZB_DESTROYED_OBJSET.
  *  - 'ds' is in the scan queue: it is removed, and a snapshot is
  *    replaced by its next snapshot with the same mintxg.
  *  - otherwise nothing changes.
  * The scan state is written out in every case.
  */
 void
 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 		if (dsl_dataset_is_snapshot(ds)) {
 			/* Note, scn_cur_{min,max}_txg stays the same. */
 			scn->scn_phys.scn_bookmark.zb_objset =
-			    ds->ds_phys->ds_next_snap_obj;
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset zb_objset to %llu",
 			    (u_longlong_t)ds->ds_object,
-			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
 			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
 		} else {
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 			    ZB_DESTROYED_OBJSET, 0, 0, 0);
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset bookmark to -1,0,0,0",
 			    (u_longlong_t)ds->ds_object);
 		}
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
-		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		if (dsl_dataset_is_snapshot(ds)) {
 			/*
 			 * We keep the same mintxg; it could be >
 			 * ds_creation_txg if the previous snapshot was
 			 * deleted too.
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
-			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+			    dsl_dataset_phys(ds)->ds_next_snap_obj,
+			    mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
-			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
 		} else {
 			zfs_dbgmsg("destroying ds %llu; in queue; removing",
 			    (u_longlong_t)ds->ds_object);
 		}
 	} else {
 		zfs_dbgmsg("destroying ds %llu; ignoring",
 		    (u_longlong_t)ds->ds_object);
 	}
 
 	/*
 	 * dsl_scan_sync() should be called after this, and should sync
 	 * out our changed state, but just to be safe, do it here.
 	 */
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * Fix up scan state after dataset 'ds' has been snapshotted.  Does
  * nothing unless a scan is in the DSS_SCANNING state.
  *
  * Wherever the scan refers to 'ds' -- as the dataset currently being
  * traversed, or as an entry in the scan queue -- the reference is
  * redirected to ds's previous snapshot (ds_prev_snap_obj), keeping the
  * queued mintxg.  The scan state is then written out.
  */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset =
-		    ds->ds_phys->ds_prev_snap_obj;
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds->ds_object,
-		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
-		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
-		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * Fix up scan state after datasets 'ds1' and 'ds2' have been swapped.
  * Does nothing unless a scan is in the DSS_SCANNING state.
  *
  * If the bookmark names one of the two datasets it is switched to the
  * other.  Independently, if one of them is in the scan queue its entry
  * is replaced by the other; if both were queued, both entries are
  * kept.  The scan state is then written out.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds1->ds_object, &mintxg) == 0) {
 		int err;
 
-		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		/* EEXIST here means ds2 was already queued as well. */
 		err = zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
 		VERIFY(err == 0 || err == EEXIST);
 		if (err == EEXIST) {
 			/* Both were there to begin with */
 			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    ds1->ds_object, mintxg, tx));
 		}
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
 		/* Only ds2 was queued: replace its entry with ds1. */
-		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	dsl_scan_sync_state(scn, tx);
 }
 
 /* Argument bundle for enqueue_clones_cb(). */
 struct enqueue_clones_arg {
 	dmu_tx_t *tx;		/* transaction used for the queue update */
 	uint64_t originobj;	/* object number of the origin to match */
 };
 
 /*
  * Per-dataset callback: if head dataset 'hds' lives in a dsl_dir whose
  * origin is eca->originobj, walk back through its previous snapshots
  * until reaching the dataset whose previous snapshot is the origin,
  * and add that dataset to the scan queue keyed by its object number
  * with its ds_prev_snap_txg as the value.
  *
  * 'arg' is a struct enqueue_clones_arg.  Returns 0 on success or when
  * 'hds' does not match, otherwise the error from
  * dsl_dataset_hold_obj().  Every hold taken here is released before
  * returning.
  */
 /* ARGSUSED */
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	struct enqueue_clones_arg *eca = arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
-	if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
+	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
 		return (0);
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 
 		/* Drop the current hold whether or not the new one worked. */
 		dsl_dataset_rele(ds, FTAG);
 		if (err)
 			return (err);
 		ds = prev;
 	}
 	VERIFY(zap_add_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object,
-	    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 	objset_t *os;
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
 	if (dmu_objset_from_ds(ds, &os))
 		goto out;
 
 	/*
 	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
 	 * snapshots can have ZIL block pointers (which may be the same
 	 * BP as in the head), they must be ignored.  So we traverse the
 	 * ZIL here, rather than in scan_recurse(), because the regular
 	 * snapshot block-sharing rules don't apply to it.
 	 */
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
 		dsl_scan_zil(dp, &os->os_zil_header);
 
 	/*
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
 
 	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "pausing=%u",
 	    (longlong_t)dsobj, dsname,
 	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_pausing);
 	kmem_free(dsname, ZFS_MAXNAMELEN);
 
 	if (scn->scn_pausing)
 		goto out;
 
 	/*
 	 * We've finished this pass over this dataset.
 	 */
 
 	/*
 	 * If we did not completely visit this dataset, do another pass.
 	 */
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass; visiting again");
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object,
 		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
 		goto out;
 	}
 
 	/*
 	 * Add descendent datasets to work queue.
 	 */
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
-		    ds->ds_phys->ds_creation_txg, tx) == 0);
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_next_snap_obj,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
 	}
-	if (ds->ds_phys->ds_num_children > 1) {
+	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
-		if (ds->ds_phys->ds_next_clones_obj != 0) {
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
 			 * cause upgrade_clones_cb() to not set
 			 * ds_next_snap_obj when it should, leading to a
 			 * missing entry.  Therefore we can only use the
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj, &count);
+			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
 			if (err == 0 &&
-			    count == ds->ds_phys->ds_num_children - 1)
+			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			VERIFY0(zap_join_key(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj,
 			    scn->scn_phys.scn_queue_obj,
-			    ds->ds_phys->ds_creation_txg, tx));
+			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
 		} else {
 			struct enqueue_clones_arg eca;
 			eca.tx = tx;
 			eca.originobj = ds->ds_object;
 
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
 		}
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /* ARGSUSED */
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		dsl_dataset_t *prev;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		/*
 		 * If this is a clone, we don't need to worry about it for now.
 		 */
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
 		}
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 	}
 
 	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Scrub/dedup interaction.
  *
  * If there are N references to a deduped block, we don't want to scrub it
  * N times -- ideally, we should scrub it exactly once.
  *
  * We leverage the fact that the dde's replication class (enum ddt_class)
  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
  *
  * To prevent excess scrubbing, the scrub begins by walking the DDT
  * to find all blocks with refcnt > 1, and scrubs each of these once.
  * Since there are two replication classes which contain blocks with
  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
  *
  * There would be nothing more to say if a block's refcnt couldn't change
  * during a scrub, but of course it can so we must account for changes
  * in a block's replication class.
  *
  * Here's an example of what can occur:
  *
  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
  * when visited during the top-down scrub phase, it will be scrubbed twice.
  * This negates our scrub optimization, but is otherwise harmless.
  *
  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
  * on each visit during the top-down scrub phase, it will never be scrubbed.
  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
  * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
  * while a scrub is in progress, it scrubs the block right then.
  */
 static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
 	ddt_entry_t dde = { 0 };
 	int error;
 	uint64_t n = 0;
 
 	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
 		ddt_t *ddt;
 
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
 			break;
 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
 		    (longlong_t)ddb->ddb_class,
 		    (longlong_t)ddb->ddb_type,
 		    (longlong_t)ddb->ddb_checksum,
 		    (longlong_t)ddb->ddb_cursor);
 
 		/* There should be no pending changes to the dedup table */
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
 		n++;
 
 		if (dsl_scan_check_pause(scn, NULL))
 			break;
 	}
 
 	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
 	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
 	    (int)scn->scn_pausing);
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
 /* ARGSUSED */
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
 	const ddt_key_t *ddk = &dde->dde_key;
 	ddt_phys_t *ddp = dde->dde_phys;
 	blkptr_t bp;
 	zbookmark_phys_t zb = { 0 };
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
 			continue;
 		ddt_bp_create(checksum, ddk, ddp, &bp);
 
 		scn->scn_visited_this_txg++;
 		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
 	}
 }
 
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_ddt(scn, tx);
 		if (scn->scn_pausing)
 			return;
 	}
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_visit_rootbp(scn, NULL,
 		    &dp->dp_meta_rootbp, tx);
 		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 		if (scn->scn_pausing)
 			return;
 
 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_cb, tx, DS_FIND_CHILDREN));
 		} else {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
 		}
 		ASSERT(!scn->scn_pausing);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
 	    ZB_DESTROYED_OBJSET) {
 		/*
 		 * If we were paused, continue from here.  Note if the
 		 * ds we were paused on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
 		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
 		if (scn->scn_pausing)
 			return;
 	}
 
 	/*
 	 * In case we were paused right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
 
 	/* keep pulling things out of the zap-object-as-queue */
 	while (zap_cursor_init(&zc, dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj),
 	    zap_cursor_retrieve(&zc, &za) == 0) {
 		dsl_dataset_t *ds;
 		uint64_t dsobj;
 
 		dsobj = strtonum(za.za_name, NULL);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, dsobj, tx));
 
 		/* Set up min/max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		if (za.za_first_integer != 0) {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
 			    za.za_first_integer);
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
-			    ds->ds_phys->ds_prev_snap_txg);
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 		}
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
 		zap_cursor_fini(&zc);
 		if (scn->scn_pausing)
 			return;
 	}
 	zap_cursor_fini(&zc);
 }
 
 static boolean_t
 dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
 	uint64_t elapsed_nanosecs;
 
 	if (zfs_recover)
 		return (B_FALSE);
 
 	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
 		return (B_TRUE);
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
 
 static int
 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 
 	if (!scn->scn_is_bptree ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
 		if (dsl_scan_free_should_pause(scn))
 			return (SET_ERROR(ERESTART));
 	}
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 	scn->scn_visited_this_txg++;
 	return (0);
 }
 
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 	if (scn->scn_phys.scn_state == DSS_SCANNING ||
 	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 	return (used != 0);
 }
 
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	int err = 0;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init).
 	 */
 	if (scn->scn_restart_txg != 0 &&
 	    scn->scn_restart_txg <= tx->tx_txg) {
 		pool_scan_func_t func = POOL_SCAN_SCRUB;
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u txg=%llu",
 		    func, tx->tx_txg);
 		dsl_scan_setup_sync(&func, tx);
 	}
 
 	/*
 	 * If the scan is inactive due to a stalled async destroy, try again.
 	 */
 	if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
 	    spa_sync_pass(dp->dp_spa) > 1)
 		return;
 
 	scn->scn_visited_this_txg = 0;
 	scn->scn_pausing = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
 	 * First process the async destroys.  If we pause, don't do
 	 * any scrubbing or resilvering.  This ensures that there are no
 	 * async destroys while we are scanning, so the scan code doesn't
 	 * have to worry about traversing it.  It is also faster to free the
 	 * blocks than to scrub them.
 	 */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 	}
 
 	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bptree_iterate(dp->dp_meta_objset,
 		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 
 		if (err == EIO || err == ECKSUM) {
 			err = 0;
 		} else if (err != 0 && err != ERESTART) {
 			zfs_panic_recover("error %u from "
 			    "traverse_dataset_destroyed()", err);
 		}
 
 		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
 			/* finished; deactivate async destroy feature */
 			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
 			ASSERT(!spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY));
 			VERIFY0(zap_remove(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, tx));
 			VERIFY0(bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
 			scn->scn_async_destroying = B_FALSE;
 			scn->scn_async_stalled = B_FALSE;
 		} else {
 			/*
 			 * If we didn't make progress, mark the async
 			 * destroy as stalled, so that we will not initiate
 			 * a spa_sync() on its behalf.  Note that we only
 			 * check this if we are not finished, because if the
 			 * bptree had no blocks for us to visit, we can
 			 * finish without "making progress".
 			 */
 			scn->scn_async_stalled =
 			    (scn->scn_visited_this_txg == 0);
 		}
 	}
 	if (scn->scn_visited_this_txg) {
 		zfs_dbgmsg("freed %llu blocks in %llums from "
 		    "free_bpobj/bptree txg %llu; err=%d",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    (longlong_t)
 		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
 		    (longlong_t)tx->tx_txg, err);
 		scn->scn_visited_this_txg = 0;
 
 		/*
 		 * Write out changes to the DDT that may be required as a
 		 * result of the blocks freed.  This ensures that the DDT
 		 * is clean when a scrub/resilver runs.
 		 */
 		ddt_sync(spa, tx->tx_txg);
 	}
 	if (err != 0)
 		return;
 	if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
-	    (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
-	    dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
-	    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
+	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
 		/*
 		 * We have finished background destroying, but there is still
 		 * some space left in the dp_free_dir. Transfer this leaked
 		 * space to the dp_leak_dir.
 		 */
 		if (dp->dp_leak_dir == NULL) {
 			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 			    LEAK_DIR_NAME, tx);
 			VERIFY0(dsl_pool_open_special_dir(dp,
 			    LEAK_DIR_NAME, &dp->dp_leak_dir));
 			rrw_exit(&dp->dp_config_rwlock, FTAG);
 		}
 		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
-		    dp->dp_free_dir->dd_phys->dd_used_bytes,
-		    dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-		    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-		    -dp->dp_free_dir->dd_phys->dd_used_bytes,
-		    -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-		    -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 	}
 	if (!scn->scn_async_destroying) {
 		/* finished; verify that space accounting went to zero */
-		ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
-		ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
-		ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
 	}
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (scn->scn_done_txg == tx->tx_txg) {
 		ASSERT(!scn->scn_pausing);
 		/* finished with scan. */
 		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
 		dsl_scan_done(scn, B_TRUE, tx);
 		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
 		dsl_scan_sync_state(scn, tx);
 		return;
 	}
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		zfs_dbgmsg("doing scan sync txg %llu; "
 		    "ddt bm=%llu/%llu/%llu/%llx",
 		    (longlong_t)tx->tx_txg,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
 	} else {
 		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
 		    (longlong_t)tx->tx_txg,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
 	}
 
 	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 	    NULL, ZIO_FLAG_CANFAIL);
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_scan_visit(scn, tx);
 	dsl_pool_config_exit(dp, FTAG);
 	(void) zio_wait(scn->scn_zio_root);
 	scn->scn_zio_root = NULL;
 
 	zfs_dbgmsg("visited %llu blocks in %llums",
 	    (longlong_t)scn->scn_visited_this_txg,
 	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
 
 	if (!scn->scn_pausing) {
 		scn->scn_done_txg = tx->tx_txg + 1;
 		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
 		    tx->tx_txg, scn->scn_done_txg);
 	}
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > 0) {
 			cv_wait(&spa->spa_scrub_io_cv,
 			    &spa->spa_scrub_lock);
 		}
 		mutex_exit(&spa->spa_scrub_lock);
 	}
 
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * This will start a new scan, or restart an existing one.
  */
 void
 dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
 {
 	if (txg == 0) {
 		dmu_tx_t *tx;
 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
 		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
 		txg = dmu_tx_get_txg(tx);
 		dp->dp_scan->scn_restart_txg = txg;
 		dmu_tx_commit(tx);
 	} else {
 		dp->dp_scan->scn_restart_txg = txg;
 	}
 	zfs_dbgmsg("restarting resilver txg=%llu", txg);
 }
 
 boolean_t
 dsl_scan_resilvering(dsl_pool_t *dp)
 {
 	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
 	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
 }
 
 /*
  * scrub consumers
  */
 
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	int i;
 
 	/*
 	 * If we resume after a reboot, zab will be NULL; don't record
 	 * incomplete stats in that case.
 	 */
 	if (zab == NULL)
 		return;
 
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
 		if (t & DMU_OT_NEWTYPE)
 			t = DMU_OT_OTHER;
 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
 		int equal;
 
 		zb->zb_count++;
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				zb->zb_ditto_2_of_2_samevdev++;
 			break;
 		case 3:
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal == 1)
 				zb->zb_ditto_2_of_3_samevdev++;
 			else if (equal == 3)
 				zb->zb_ditto_3_of_3_samevdev++;
 			break;
 		}
 	}
 }
 
 static void
 dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (zio->io_error && (zio->io_error != ECKSUM ||
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
 		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t size = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	unsigned int scan_delay = 0;
 
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
 	    phys_birth >= scn->scn_phys.scn_max_txg)
 		return (0);
 
 	count_block(dp->dp_blkstats, bp);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 		needs_io = B_TRUE;
 		scan_delay = zfs_scrub_delay;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 		needs_io = B_FALSE;
 		scan_delay = zfs_resilver_delay;
 	}
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		vdev_t *vd = vdev_lookup_top(spa,
 		    DVA_GET_VDEV(&bp->blk_dva[d]));
 
 		/*
 		 * Keep track of how much data we've examined so that
 		 * zpool(1M) status can make useful progress reports.
 		 */
 		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
 		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
 
 		/* if it's a resilver, this may not be in the target range */
 		if (!needs_io) {
 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
 				/*
 				 * Gang members may be spread across multiple
 				 * vdevs, so the best estimate we have is the
 				 * scrub range, which has already been checked.
 				 * XXX -- it would be better to change our
 				 * allocation policy to ensure that all
 				 * gang members reside on the same vdev.
 				 */
 				needs_io = B_TRUE;
 			} else {
 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
 				    phys_birth, 1);
 			}
 		}
 	}
 
 	if (needs_io && !zfs_no_scrub_io) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		uint64_t maxinflight = rvd->vdev_children *
 		    MAX(zfs_top_maxinflight, 1);
 		void *data = zio_data_buf_alloc(size);
 
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= maxinflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		/*
 		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
 		 * then throttle our workload to limit the impact of a scan.
 		 */
 		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
 			delay(MAX((int)scan_delay, 0));
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
 		    zio_flags, zb));
 	}
 
 	/* do not relocate this block */
 	return (0);
 }
 
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 {
 	spa_t *spa = dp->dp_spa;
 
 	/*
 	 * Purge all vdev caches and probe all devices.  We do this here
 	 * rather than in sync context because this requires a writer lock
 	 * on the spa_config lock, which we can't do from sync context.  The
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
 	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c	(revision 275782)
@@ -1,186 +1,186 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/metaslab.h>
 
 #define	DST_AVG_BLKSHIFT 14
 
 /* ARGSUSED */
 static int
 dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
 {
 	return (0);
 }
 
 /*
  * Called from open context to perform a callback in syncing context.  Waits
  * for the operation to complete.
  *
  * The checkfunc will be called from open context as a preliminary check
  * which can quickly fail.  If it succeeds, it will be called again from
  * syncing context.  The checkfunc should generally be designed to work
  * properly in either context, but if necessary it can check
  * dmu_tx_is_syncing(tx).
  *
  * The synctask infrastructure enforces proper locking strategy with respect
  * to the dp_config_rwlock -- the lock will always be held when the callbacks
  * are called.  It will be held for read during the open-context (preliminary)
  * call to the checkfunc, and then held for write from syncing context during
  * the calls to the check and sync funcs.
  *
  * A dataset or pool name can be passed as the first argument.  Typically,
  * the check func will hold, check the return value of the hold, and then
  * release the dataset.  The sync func will VERIFYO(hold()) the dataset.
  * This is safe because no changes can be made between the check and sync funcs,
  * and the sync func will only be called if the check func successfully opened
  * the dataset.
  */
 int
 dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
     dsl_syncfunc_t *syncfunc, void *arg,
     int blocks_modified, zfs_space_check_t space_check)
 {
 	spa_t *spa;
 	dmu_tx_t *tx;
 	int err;
 	dsl_sync_task_t dst = { 0 };
 	dsl_pool_t *dp;
 
 	err = spa_open(pool, &spa, FTAG);
 	if (err != 0)
 		return (err);
 	dp = spa_get_dsl(spa);
 
 top:
 	tx = dmu_tx_create_dd(dp->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	dst.dst_pool = dp;
 	dst.dst_txg = dmu_tx_get_txg(tx);
 	dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
 	dst.dst_space_check = space_check;
 	dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
 	dst.dst_syncfunc = syncfunc;
 	dst.dst_arg = arg;
 	dst.dst_error = 0;
 	dst.dst_nowaiter = B_FALSE;
 
 	dsl_pool_config_enter(dp, FTAG);
 	err = dst.dst_checkfunc(arg, tx);
 	dsl_pool_config_exit(dp, FTAG);
 
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		spa_close(spa, FTAG);
 		return (err);
 	}
 
 	VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
 
 	dmu_tx_commit(tx);
 
 	txg_wait_synced(dp, dst.dst_txg);
 
 	if (dst.dst_error == EAGAIN) {
 		txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
 		goto top;
 	}
 
 	spa_close(spa, FTAG);
 	return (dst.dst_error);
 }
 
 void
 dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
     int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
 {
 	dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
 	dst->dst_pool = dp;
 	dst->dst_txg = dmu_tx_get_txg(tx);
 	dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
 	dst->dst_space_check = space_check;
 	dst->dst_checkfunc = dsl_null_checkfunc;
 	dst->dst_syncfunc = syncfunc;
 	dst->dst_arg = arg;
 	dst->dst_error = 0;
 	dst->dst_nowaiter = B_TRUE;
 
 	VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
 }
 
 /*
  * Called in syncing context to execute the synctask.
  */
 void
 dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dst->dst_pool;
 
 	ASSERT0(dst->dst_error);
 
 	/*
 	 * Check for sufficient space.
 	 *
 	 * When the sync task was created, the caller specified the
 	 * type of space checking required.  See the comment in
 	 * zfs_space_check_t for details on the semantics of each
 	 * type of space checking.
 	 *
 	 * We just check against what's on-disk; we don't want any
 	 * in-flight accounting to get in our way, because open context
 	 * may have already used up various in-core limits
 	 * (arc_tempreserve, dsl_pool_tempreserve).
 	 */
 	if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
 		uint64_t quota = dsl_pool_adjustedsize(dp,
 		    dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
 		    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
-		uint64_t used = dp->dp_root_dir->dd_phys->dd_used_bytes;
+		uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
 		/* MOS space is triple-dittoed, so we multiply by 3. */
 		if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
 			dst->dst_error = SET_ERROR(ENOSPC);
 			if (dst->dst_nowaiter)
 				kmem_free(dst, sizeof (*dst));
 			return;
 		}
 	}
 
 	/*
 	 * Check for errors by calling checkfunc.
 	 */
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
 	if (dst->dst_error == 0)
 		dst->dst_syncfunc(dst->dst_arg, tx);
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	if (dst->dst_nowaiter)
 		kmem_free(dst, sizeof (*dst));
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c	(revision 275782)
@@ -1,665 +1,666 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 
 typedef struct dsl_dataset_user_hold_arg {
 	nvlist_t *dduha_holds;
 	nvlist_t *dduha_chkholds;
 	nvlist_t *dduha_errlist;
 	minor_t dduha_minor;
 } dsl_dataset_user_hold_arg_t;
 
 /*
  * If you add new checks here, you may need to add additional checks to the
  * "temporary" case in snapshot_check() in dmu_objset.c.
  */
 int
 dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
     boolean_t temphold, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	int error = 0;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	if (strlen(htag) > MAXNAMELEN)
 		return (SET_ERROR(E2BIG));
 	/* Tempholds have a more restricted length */
 	if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
 		return (SET_ERROR(E2BIG));
 
 	/* tags must be unique (if ds already exists) */
-	if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) {
+	if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 		uint64_t value;
 
-		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
+		error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
 		    htag, 8, 1, &value);
 		if (error == 0)
 			error = SET_ERROR(EEXIST);
 		else if (error == ENOENT)
 			error = 0;
 	}
 
 	return (error);
 }
 
 static int
 dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_user_hold_arg_t *dduha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
 		dsl_dataset_t *ds;
 		int error = 0;
 		char *htag, *name;
 
 		/* must be a snapshot */
 		name = nvpair_name(pair);
 		if (strchr(name, '@') == NULL)
 			error = SET_ERROR(EINVAL);
 
 		if (error == 0)
 			error = nvpair_value_string(pair, &htag);
 
 		if (error == 0)
 			error = dsl_dataset_hold(dp, name, FTAG, &ds);
 
 		if (error == 0) {
 			error = dsl_dataset_user_hold_check_one(ds, htag,
 			    dduha->dduha_minor != 0, tx);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (error == 0) {
 			fnvlist_add_string(dduha->dduha_chkholds, name, htag);
 		} else {
 			/*
 			 * We register ENOENT errors so they can be correctly
 			 * reported if needed, such as when all holds fail.
 			 */
 			fnvlist_add_int32(dduha->dduha_errlist, name, error);
 			if (error != ENOENT)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 
 static void
 dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
     const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
-	if (ds->ds_phys->ds_userrefs_obj == 0) {
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
 		/*
 		 * This is the first user hold for this dataset.  Create
 		 * the userrefs zap object.
 		 */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		zapobj = ds->ds_phys->ds_userrefs_obj =
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
 		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
 	} else {
-		zapobj = ds->ds_phys->ds_userrefs_obj;
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
 	}
 	ds->ds_userrefs++;
 
 	VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
 
 	if (minor != 0) {
 		char name[MAXNAMELEN];
 		nvlist_t *tags;
 
 		VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
 		    htag, now, tx));
 		(void) snprintf(name, sizeof (name), "%llx",
 		    (u_longlong_t)ds->ds_object);
 
 		if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
 			tags = fnvlist_alloc();
 			fnvlist_add_boolean(tags, htag);
 			fnvlist_add_nvlist(tmpholds, name, tags);
 			fnvlist_free(tags);
 		} else {
 			fnvlist_add_boolean(tags, htag);
 		}
 	}
 
 	spa_history_log_internal_ds(ds, "hold", tx,
 	    "tag=%s temp=%d refs=%llu",
 	    htag, minor != 0, ds->ds_userrefs);
 }
 
 typedef struct zfs_hold_cleanup_arg {
 	char zhca_spaname[MAXNAMELEN];
 	uint64_t zhca_spa_load_guid;
 	nvlist_t *zhca_holds;
 } zfs_hold_cleanup_arg_t;
 
 static void
 dsl_dataset_user_release_onexit(void *arg)
 {
 	zfs_hold_cleanup_arg_t *ca = arg;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(ca->zhca_spaname, &spa, FTAG);
 	if (error != 0) {
 		zfs_dbgmsg("couldn't release holds on pool=%s "
 		    "because pool is no longer loaded",
 		    ca->zhca_spaname);
 		return;
 	}
 	if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
 		zfs_dbgmsg("couldn't release holds on pool=%s "
 		    "because pool is no longer loaded (guid doesn't match)",
 		    ca->zhca_spaname);
 		spa_close(spa, FTAG);
 		return;
 	}
 
 	(void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
 	fnvlist_free(ca->zhca_holds);
 	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
 	spa_close(spa, FTAG);
 }
 
 static void
 dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
 {
 	zfs_hold_cleanup_arg_t *ca;
 
 	if (minor == 0 || nvlist_empty(holds)) {
 		fnvlist_free(holds);
 		return;
 	}
 
 	ASSERT(spa != NULL);
 	ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
 
 	(void) strlcpy(ca->zhca_spaname, spa_name(spa),
 	    sizeof (ca->zhca_spaname));
 	ca->zhca_spa_load_guid = spa_load_guid(spa);
 	ca->zhca_holds = holds;
 	VERIFY0(zfs_onexit_add_cb(minor,
 	    dsl_dataset_user_release_onexit, ca, NULL));
 }
 
 void
 dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
     minor_t minor, uint64_t now, dmu_tx_t *tx)
 {
 	nvlist_t *tmpholds;
 
 	if (minor != 0)
 		tmpholds = fnvlist_alloc();
 	else
 		tmpholds = NULL;
 	dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
 	dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
 }
 
 static void
 dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_user_hold_arg_t *dduha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvlist_t *tmpholds;
 	uint64_t now = gethrestime_sec();
 
 	if (dduha->dduha_minor != 0)
 		tmpholds = fnvlist_alloc();
 	else
 		tmpholds = NULL;
 	for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
 	    pair != NULL;
 	    pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
 		dsl_dataset_t *ds;
 
 		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
 		dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
 		    fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
 }
 
 /*
  * The full semantics of this function are described in the comment above
  * lzc_hold().
  *
  * To summarize:
  * holds is nvl of snapname -> holdname
  * errlist will be filled in with snapname -> error
  *
-  * The snaphosts must all be in the same pool.
+  * The snapshots must all be in the same pool.
  *
  * Holds for snapshots that don't exist will be skipped.
  *
  * If none of the snapshots for requested holds exist then ENOENT will be
  * returned.
  *
  * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned
  * up when the process exits.
  *
  * On success all the holds, for snapshots that existed, will be created and 0
  * will be returned.
  *
  * On failure no holds will be created, the errlist will be filled in,
-  * and an errno will returned.
+  * and an errno will be returned.
  *
  * In all cases the errlist will contain entries for holds where the snapshot
  * didn't exist.
  */
 int
 dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
 {
 	dsl_dataset_user_hold_arg_t dduha;
 	nvpair_t *pair;
 	int ret;
 
 	pair = nvlist_next_nvpair(holds, NULL);
 	if (pair == NULL)
 		return (0);
 
 	dduha.dduha_holds = holds;
 	dduha.dduha_chkholds = fnvlist_alloc();
 	dduha.dduha_errlist = errlist;
 	dduha.dduha_minor = cleanup_minor;
 
 	ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
 	    dsl_dataset_user_hold_sync, &dduha,
 	    fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
 	fnvlist_free(dduha.dduha_chkholds);
 
 	return (ret);
 }
 
 typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dataset_t **dsp);
 
 typedef struct dsl_dataset_user_release_arg {
 	dsl_holdfunc_t *ddura_holdfunc;
 	nvlist_t *ddura_holds;
 	nvlist_t *ddura_todelete;
 	nvlist_t *ddura_errlist;
 	nvlist_t *ddura_chkholds;
 } dsl_dataset_user_release_arg_t;
 
 /* Place a dataset hold on the snapshot identified by passed dsobj string */
 static int
 dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp));
 }
 
 static int
 dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
     dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
 {
 	uint64_t zapobj;
 	nvlist_t *holds_found;
 	objset_t *mos;
 	int numholds;
 
 	if (!dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_empty(holds))
 		return (0);
 
 	numholds = 0;
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	zapobj = ds->ds_phys->ds_userrefs_obj;
+	zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
 	holds_found = fnvlist_alloc();
 
 	for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(holds, pair)) {
 		uint64_t tmp;
 		int error;
 		const char *holdname = nvpair_name(pair);
 
 		if (zapobj != 0)
 			error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
 		else
 			error = SET_ERROR(ENOENT);
 
 		/*
 		 * Non-existent holds are put on the errlist, but don't
 		 * cause an overall failure.
 		 */
 		if (error == ENOENT) {
 			if (ddura->ddura_errlist != NULL) {
 				char *errtag = kmem_asprintf("%s#%s",
 				    snapname, holdname);
 				fnvlist_add_int32(ddura->ddura_errlist, errtag,
 				    ENOENT);
 				strfree(errtag);
 			}
 			continue;
 		}
 
 		if (error != 0) {
 			fnvlist_free(holds_found);
 			return (error);
 		}
 
 		fnvlist_add_boolean(holds_found, holdname);
 		numholds++;
 	}
 
-	if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
+	if (DS_IS_DEFER_DESTROY(ds) &&
+	    dsl_dataset_phys(ds)->ds_num_children == 1 &&
 	    ds->ds_userrefs == numholds) {
 		/* we need to destroy the snapshot as well */
 		if (dsl_dataset_long_held(ds)) {
 			fnvlist_free(holds_found);
 			return (SET_ERROR(EBUSY));
 		}
 		fnvlist_add_boolean(ddura->ddura_todelete, snapname);
 	}
 
 	if (numholds != 0) {
 		fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
 		    holds_found);
 	}
 	fnvlist_free(holds_found);
 
 	return (0);
 }
 
 static int
 dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_user_release_arg_t *ddura;
 	dsl_holdfunc_t *holdfunc;
 	dsl_pool_t *dp;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	dp = dmu_tx_pool(tx);
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	ddura = arg;
 	holdfunc = ddura->ddura_holdfunc;
 
 	for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
 		int error;
 		dsl_dataset_t *ds;
 		nvlist_t *holds;
 		const char *snapname = nvpair_name(pair);
 
 		error = nvpair_value_nvlist(pair, &holds);
 		if (error != 0)
 			error = (SET_ERROR(EINVAL));
 		else
 			error = holdfunc(dp, snapname, FTAG, &ds);
 		if (error == 0) {
 			error = dsl_dataset_user_release_check_one(ddura, ds,
 			    holds, snapname);
 			dsl_dataset_rele(ds, FTAG);
 		}
 		if (error != 0) {
 			if (ddura->ddura_errlist != NULL) {
 				fnvlist_add_int32(ddura->ddura_errlist,
 				    snapname, error);
 			}
 			/*
 			 * Non-existent snapshots are put on the errlist,
 			 * but don't cause an overall failure.
 			 */
 			if (error != ENOENT)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 static void
 dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(holds, pair)) {
 		int error;
 		const char *holdname = nvpair_name(pair);
 
 		/* Remove temporary hold if one exists. */
 		error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
 		VERIFY(error == 0 || error == ENOENT);
 
-		VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname,
-		    tx));
+		VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    holdname, tx));
 		ds->ds_userrefs--;
 
 		spa_history_log_internal_ds(ds, "release", tx,
 		    "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
 	}
 }
 
 static void
 dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_user_release_arg_t *ddura = arg;
 	dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
 	    pair)) {
 		dsl_dataset_t *ds;
 		const char *name = nvpair_name(pair);
 
 		VERIFY0(holdfunc(dp, name, FTAG, &ds));
 
 		dsl_dataset_user_release_sync_one(ds,
 		    fnvpair_value_nvlist(pair), tx);
 		if (nvlist_exists(ddura->ddura_todelete, name)) {
 			ASSERT(ds->ds_userrefs == 0 &&
-			    ds->ds_phys->ds_num_children == 1 &&
+			    dsl_dataset_phys(ds)->ds_num_children == 1 &&
 			    DS_IS_DEFER_DESTROY(ds));
 			dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
 		}
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * The full semantics of this function are described in the comment above
  * lzc_release().
  *
  * To summarize:
  * Releases holds specified in the nvl holds.
  *
  * holds is nvl of snapname -> { holdname, ... }
  * errlist will be filled in with snapname -> error
  *
  * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
-  * otherwise they should be the names of shapshots.
+  * otherwise they should be the names of snapshots.
  *
-  * As a release may cause snapshots to be destroyed this trys to ensure they
+  * As a release may cause snapshots to be destroyed this tries to ensure they
  * aren't mounted.
  *
-  * The release of non-existent holds are skipped.
+  * Releases of non-existent holds are skipped.
  *
-  * At least one hold must have been released for the this function to succeed
+  * At least one hold must have been released for this function to succeed
  * and return 0.
  */
 static int
 dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
     dsl_pool_t *tmpdp)
 {
 	dsl_dataset_user_release_arg_t ddura;
 	nvpair_t *pair;
 	char *pool;
 	int error;
 
 	pair = nvlist_next_nvpair(holds, NULL);
 	if (pair == NULL)
 		return (0);
 
 	/*
 	 * The release may cause snapshots to be destroyed; make sure they
 	 * are not mounted.
 	 */
 	if (tmpdp != NULL) {
 		/* Temporary holds are specified by dsobj string. */
 		ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
 		pool = spa_name(tmpdp->dp_spa);
 #ifdef _KERNEL
 		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(holds, pair)) {
 			dsl_dataset_t *ds;
 
 			dsl_pool_config_enter(tmpdp, FTAG);
 			error = dsl_dataset_hold_obj_string(tmpdp,
 			    nvpair_name(pair), FTAG, &ds);
 			if (error == 0) {
 				char name[MAXNAMELEN];
 				dsl_dataset_name(ds, name);
 				dsl_pool_config_exit(tmpdp, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				(void) zfs_unmount_snap(name);
 			} else {
 				dsl_pool_config_exit(tmpdp, FTAG);
 			}
 		}
 #endif
 	} else {
 		/* Non-temporary holds are specified by name. */
 		ddura.ddura_holdfunc = dsl_dataset_hold;
 		pool = nvpair_name(pair);
 #ifdef _KERNEL
 		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(holds, pair)) {
 			(void) zfs_unmount_snap(nvpair_name(pair));
 		}
 #endif
 	}
 
 	ddura.ddura_holds = holds;
 	ddura.ddura_errlist = errlist;
 	ddura.ddura_todelete = fnvlist_alloc();
 	ddura.ddura_chkholds = fnvlist_alloc();
 
 	error = dsl_sync_task(pool, dsl_dataset_user_release_check,
 	    dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
 	fnvlist_free(ddura.ddura_todelete);
 	fnvlist_free(ddura.ddura_chkholds);
 
 	return (error);
 }
 
 /*
  * holds is nvl of snapname -> { holdname, ... }
  * errlist will be filled in with snapname -> error
  */
 int
 dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
 {
 	return (dsl_dataset_user_release_impl(holds, errlist, NULL));
 }
 
 /*
  * holds is nvl of snapdsobj -> { holdname, ... }
  */
 void
 dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
 {
 	ASSERT(dp != NULL);
 	(void) dsl_dataset_user_release_impl(holds, NULL, dp);
 }
 
 int
 dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_pool_hold(dsname, FTAG, &dp);
 	if (err != 0)
 		return (err);
 	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
-	if (ds->ds_phys->ds_userrefs_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 		zap_attribute_t *za;
 		zap_cursor_t zc;
 
 		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-		    ds->ds_phys->ds_userrefs_obj);
+		    dsl_dataset_phys(ds)->ds_userrefs_obj);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			fnvlist_add_uint64(nvl, za->za_name,
 			    za->za_first_integer);
 		}
 		zap_cursor_fini(&zc);
 		kmem_free(za, sizeof (zap_attribute_t));
 	}
 	dsl_dataset_rele(ds, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 275782)
@@ -1,2007 +1,2006 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 iXsystems, Inc
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sunddi.h>
 #include <sys/sa_impl.h>
 #include <sys/dnode.h>
 #include <sys/errno.h>
 #include <sys/zfs_context.h>
 
 /*
  * ZFS System attributes:
  *
  * A generic mechanism to allow for arbitrary attributes
  * to be stored in a dnode.  The data will be stored in the bonus buffer of
  * the dnode and if necessary a special "spill" block will be used to handle
  * overflow situations.  The spill block will be sized to fit the data
  * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
  * spill block is stored at the end of the current bonus buffer.  Any
  * attributes that would be in the way of the blkptr_t will be relocated
  * into the spill block.
  *
  * Attribute registration:
  *
  * Stored persistently on a per dataset basis
  * a mapping between attribute "string" names and their actual attribute
  * numeric values, length, and byteswap function.  The names are only used
  * during registration.  All  attributes are known by their unique attribute
  * id value.  If an attribute can have a variable size then the value
  * 0 will be used to indicate this.
  *
  * Attribute Layout:
  *
  * Attribute layouts are a way to compactly store multiple attributes, but
  * without taking the overhead associated with managing each attribute
  * individually.  Since you will typically have the same set of attributes
  * stored in the same order a single table will be used to represent that
  * layout.  The ZPL for example will usually have only about 10 different
  * layouts (regular files, device files, symlinks,
  * regular files + scanstamp, files/dir with extended attributes, and then
  * you have the possibility of all of those minus ACL, because it would
  * be kicked out into the spill block)
  *
  * Layouts are simply an array of the attributes and their
  * ordering i.e. [0, 1, 4, 5, 2]
  *
-  * Each distinct layout is given a unique layout number and that is whats
+  * Each distinct layout is given a unique layout number and that is what's
  * stored in the header at the beginning of the SA data buffer.
  *
  * A layout only covers a single dbuf (bonus or spill).  If a set of
  * attributes is split up between the bonus buffer and a spill buffer then
  * two different layouts will be used.  This allows us to byteswap the
  * spill without looking at the bonus buffer and keeps the on disk format of
  * the bonus and spill buffer the same.
  *
  * Adding a single attribute will cause the entire set of attributes to
  * be rewritten and could result in a new layout number being constructed
  * as part of the rewrite if no such layout exists for the new set of
-  * attribues.  The new attribute will be appended to the end of the already
+  * attributes.  The new attribute will be appended to the end of the already
  * existing attributes.
  *
  * Both the attribute registration and attribute layout information are
-  * stored in normal ZAP attributes.  Their should be a small number of
+  * stored in normal ZAP attributes.  There should be a small number of
  * known layouts and the set of attributes is assumed to typically be quite
  * small.
  *
  * The registered attributes and layout "table" information is maintained
  * in core and a special "sa_os_t" is attached to the objset_t.
  *
  * A special interface is provided to allow for quickly applying
  * a large set of attributes at once.  sa_replace_all_by_template() is
  * used to set an array of attributes.  This is used by the ZPL when
  * creating a brand new file.  The template that is passed into the function
  * specifies the attribute, size for variable length attributes, location of
  * data and special "data locator" function if the data isn't in a contiguous
  * location.
  *
  * Byteswap implications:
  *
  * Since the SA attributes are not entirely self describing we can't do
  * the normal byteswap processing.  The special ZAP layout attribute and
  * attribute registration attributes define the byteswap function and the
  * size of the attributes, unless it is variable sized.
  * The normal ZFS byteswapping infrastructure assumes you don't need
  * to read any objects in order to do the necessary byteswapping.  Whereas
  * SA attributes can only be properly byteswapped if the dataset is opened
  * and the layout/attribute ZAP attributes are available.  Because of this
  * the SA attributes will be byteswapped when they are first accessed by
  * the SA code that will read the SA data.
  */
 
 typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
     uint16_t length, int length_idx, boolean_t, void *userp);
 
 static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
 static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
 static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
     void *data);
 static void sa_idx_tab_rele(objset_t *os, void *arg);
 static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
     int buflen);
 static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx);
 
 arc_byteswap_func_t *sa_bswap_table[] = {
 	byteswap_uint64_array,
 	byteswap_uint32_array,
 	byteswap_uint16_array,
 	byteswap_uint8_array,
 	zfs_acl_byteswap,
 };
 
 #define	SA_COPY_DATA(f, s, t, l) \
 	{ \
 		if (f == NULL) { \
 			if (l == 8) { \
 				*(uint64_t *)t = *(uint64_t *)s; \
 			} else if (l == 16) { \
 				*(uint64_t *)t = *(uint64_t *)s; \
 				*(uint64_t *)((uintptr_t)t + 8) = \
 				    *(uint64_t *)((uintptr_t)s + 8); \
 			} else { \
 				bcopy(s, t, l); \
 			} \
 		} else \
 			sa_copy_data(f, s, t, l); \
 	}
 
 /*
  * This table is fixed and cannot be changed.  Its purpose is to
  * allow the SA code to work with both old/new ZPL file systems.
  * It contains the list of legacy attributes.  These attributes aren't
  * stored in the "attribute" registry zap objects, since older ZPL file systems
  * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
  * use this static table.
  */
 sa_attr_reg_t sa_legacy_attrs[] = {
 	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
 	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
 	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
 	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
 	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
 	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
 	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
 	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
 	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
 	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
 	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
 	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
 	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
 	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
 	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
 	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
 };
 
 /*
  * This is only used for objects of type DMU_OT_ZNODE
  */
 sa_attr_type_t sa_legacy_zpl_layout[] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
 /*
  * Special dummy layout used for buffers with no attributes.
  */
 sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
 
 static int sa_legacy_attr_count = 16;
 static kmem_cache_t *sa_cache = NULL;
 
 /*ARGSUSED*/
 static int
 sa_cache_constructor(void *buf, void *unused, int kmflag)
 {
 	sa_handle_t *hdl = buf;
 
 	hdl->sa_bonus_tab = NULL;
 	hdl->sa_spill_tab = NULL;
 	hdl->sa_os = NULL;
 	hdl->sa_userp = NULL;
 	hdl->sa_bonus = NULL;
 	hdl->sa_spill = NULL;
 	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
 /*ARGSUSED*/
 static void
 sa_cache_destructor(void *buf, void *unused)
 {
 	sa_handle_t *hdl = buf;
 	mutex_destroy(&hdl->sa_lock);
 }
 
 void
 sa_cache_init(void)
 {
 	sa_cache = kmem_cache_create("sa_cache",
 	    sizeof (sa_handle_t), 0, sa_cache_constructor,
 	    sa_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 void
 sa_cache_fini(void)
 {
 	if (sa_cache)
 		kmem_cache_destroy(sa_cache);
 }
 
 static int
 layout_num_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *node1 = arg1;
 	const sa_lot_t *node2 = arg2;
 
 	if (node1->lot_num > node2->lot_num)
 		return (1);
 	else if (node1->lot_num < node2->lot_num)
 		return (-1);
 	return (0);
 }
 
 static int
 layout_hash_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *node1 = arg1;
 	const sa_lot_t *node2 = arg2;
 
 	if (node1->lot_hash > node2->lot_hash)
 		return (1);
 	if (node1->lot_hash < node2->lot_hash)
 		return (-1);
 	if (node1->lot_instance > node2->lot_instance)
 		return (1);
 	if (node1->lot_instance < node2->lot_instance)
 		return (-1);
 	return (0);
 }
 
 boolean_t
 sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
 {
 	int i;
 
 	if (count != tbf->lot_attr_count)
 		return (1);
 
 	for (i = 0; i != count; i++) {
 		if (attrs[i] != tbf->lot_attrs[i])
 			return (1);
 	}
 	return (0);
 }
 
 #define	SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
 
 static uint64_t
 sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
 {
 	int i;
 	uint64_t crc = -1ULL;
 
 	for (i = 0; i != attr_count; i++)
 		crc ^= SA_ATTR_HASH(attrs[i]);
 
 	return (crc);
 }
 
 static int
 sa_get_spill(sa_handle_t *hdl)
 {
 	int rc;
 	if (hdl->sa_spill == NULL) {
 		if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
 		    &hdl->sa_spill)) == 0)
 			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 	} else {
 		rc = 0;
 	}
 
 	return (rc);
 }
 
 /*
  * Main attribute lookup/update function
  * returns 0 for success or non zero for failures
  *
  * Operates on bulk array, first failure will abort further processing
  */
 /*
  * For each of the `count' entries in `bulk', locate the attribute (bonus
  * buffer index table first, then the spill block) and apply data_op
  * (SA_LOOKUP or SA_UPDATE) to it.
  *
  * When tx is non-NULL the bonus and spill buffers are each dirtied at most
  * once per call.  Returns 0 on success or the first error encountered;
  * ECKSUM from sa_get_spill() is reported to the caller as EIO.
  */
 int
 sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     sa_data_op_t data_op, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	int i;
 	int error = 0;
 	sa_buf_type_t buftypes;	/* buffers already dirtied by this call */
 
 	buftypes = 0;
 
 	ASSERT(count > 0);
 	for (i = 0; i != count; i++) {
 		/*
 		 * NOTE(review): sa_attr is used below as an index into
 		 * sa_idx_tab, which sa_find_idx_tab() allocates with
 		 * sa_num_attrs entries, so `<' looks like the intended
 		 * bound here -- confirm.
 		 */
 		ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
 
 		/* sa_addr == NULL means "not located yet" */
 		bulk[i].sa_addr = NULL;
 		/* First check the bonus buffer */
 
 		if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
 		    hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
 			SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
 			    SA_GET_HDR(hdl, SA_BONUS),
 			    bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
 			if (tx && !(buftypes & SA_BONUS)) {
 				dmu_buf_will_dirty(hdl->sa_bonus, tx);
 				buftypes |= SA_BONUS;
 			}
 		}
 		/* Not in the bonus buffer: fall back to the spill block */
 		if (bulk[i].sa_addr == NULL &&
 		    ((error = sa_get_spill(hdl)) == 0)) {
 			if (TOC_ATTR_PRESENT(
 			    hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
 				SA_ATTR_INFO(sa, hdl->sa_spill_tab,
 				    SA_GET_HDR(hdl, SA_SPILL),
 				    bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
 				/*
 				 * The spill buffer is only dirtied here when
 				 * the stored size equals the requested length.
 				 */
 				if (tx && !(buftypes & SA_SPILL) &&
 				    bulk[i].sa_size == bulk[i].sa_length) {
 					dmu_buf_will_dirty(hdl->sa_spill, tx);
 					buftypes |= SA_SPILL;
 				}
 			}
 		}
 		/* ENOENT from sa_get_spill() is tolerated at this point */
 		if (error && error != ENOENT) {
 			return ((error == ECKSUM) ? EIO : error);
 		}
 
 		switch (data_op) {
 		case SA_LOOKUP:
 			if (bulk[i].sa_addr == NULL)
 				return (SET_ERROR(ENOENT));
 			/* a NULL sa_data skips the copy-out */
 			if (bulk[i].sa_data) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_addr, bulk[i].sa_data,
 				    bulk[i].sa_size);
 			}
 			continue;
 
 		case SA_UPDATE:
 			/* existing rewrite of attr */
 			if (bulk[i].sa_addr &&
 			    bulk[i].sa_size == bulk[i].sa_length) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_addr,
 				    bulk[i].sa_length);
 				continue;
 			} else if (bulk[i].sa_addr) { /* attr size change */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_REPLACE, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			} else { /* adding new attribute */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_ADD, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			}
 			if (error)
 				return (error);
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Allocate a layout entry describing `attrs', optionally persist it in the
  * layout ZAP object (zapadd), and insert it into both the layout-number and
  * layout-hash AVL trees.  Caller must hold sa_lock.  Returns the new entry.
  */
 static sa_lot_t *
 sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
     uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *entry;
 	sa_lot_t *other;
 	avl_index_t where;
 	int n;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 
 	entry = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
 	entry->lot_num = lot_num;
 	entry->lot_hash = hash;
 	entry->lot_instance = 0;
 	entry->lot_attr_count = attr_count;
 	entry->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	bcopy(attrs, entry->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
 
 	if (zapadd) {
 		char attr_name[8];
 
 		/* create the layouts ZAP object on first use */
 		if (sa->sa_layout_attr_obj == 0) {
 			sa->sa_layout_attr_obj = zap_create_link(os,
 			    DMU_OT_SA_ATTR_LAYOUTS,
 			    sa->sa_master_obj, SA_LAYOUTS, tx);
 		}
 
 		/* the ZAP entry is keyed by the decimal layout number */
 		(void) snprintf(attr_name, sizeof (attr_name),
 		    "%d", (int)lot_num);
 		VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
 		    attr_name, 2, attr_count, attrs, tx));
 	}
 
 	list_create(&entry->lot_idx_tab, sizeof (sa_idx_tab_t),
 	    offsetof(sa_idx_tab_t, sa_next));
 
 	/* count attributes whose registered length is 0 (variable size) */
 	n = 0;
 	while (n != attr_count) {
 		if (sa->sa_attr_table[entry->lot_attrs[n]].sa_length == 0)
 			entry->lot_var_sizes++;
 		n++;
 	}
 
 	avl_add(&sa->sa_layout_num_tree, entry);
 
 	/*
 	 * On a hash collision, bump lot_instance past the instances
 	 * already present for this hash.
 	 */
 	other = avl_find(&sa->sa_layout_hash_tree, entry, &where);
 	while (other != NULL && other->lot_hash == hash) {
 		if (other->lot_instance != entry->lot_instance)
 			break;
 		entry->lot_instance++;
 		other = AVL_NEXT(&sa->sa_layout_hash_tree, other);
 	}
 	avl_add(&sa->sa_layout_hash_tree, entry);
 
 	return (entry);
 }
 
 /*
  * Look up the layout matching `attrs' among the entries sharing `hash';
  * if none matches, add a new layout numbered by the current size of the
  * layout-number tree.  The result is stored through `lot'.
  */
 static void
 sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
     int count, dmu_tx_t *tx, sa_lot_t **lot)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t key;
 	sa_lot_t *cand;
 	sa_lot_t *match = NULL;
 	avl_index_t where;
 
 	mutex_enter(&sa->sa_lock);
 
 	key.lot_hash = hash;
 	key.lot_instance = 0;
 
 	/* walk every instance that shares this hash */
 	for (cand = avl_find(&sa->sa_layout_hash_tree, &key, &where);
 	    cand != NULL && cand->lot_hash == hash;
 	    cand = AVL_NEXT(&sa->sa_layout_hash_tree, cand)) {
 		if (sa_layout_equal(cand, attrs, count) == 0) {
 			match = cand;
 			break;
 		}
 	}
 
 	if (match == NULL) {
 		match = sa_add_layout_entry(os, attrs, count,
 		    avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
 	}
 
 	mutex_exit(&sa->sa_lock);
 	*lot = match;
 }
 
 /*
  * Set the spill block size to `size' rounded up to a multiple of
  * SPA_MINBLOCKSIZE (SPA_MINBLOCKSIZE itself when size is 0).
  * Returns EFBIG when size exceeds SPA_OLD_MAXBLOCKSIZE, otherwise the
  * result of dbuf_spill_set_blksz().
  */
 static int
 sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 {
 	uint32_t blocksize;
 	int error;
 
 	/* oversized requests are a caller bug */
 	if (size > SPA_OLD_MAXBLOCKSIZE) {
 		ASSERT(0);
 		return (SET_ERROR(EFBIG));
 	}
 
 	if (size != 0)
 		blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
 	else
 		blocksize = SPA_MINBLOCKSIZE;
 
 	error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
 	ASSERT(error == 0);
 	return (error);
 }
 
 /*
  * Copy buflen bytes into `target'.  Without a locator the bytes come
  * straight from `datastart'; with one, the locator is called repeatedly
  * and each chunk it yields is appended until buflen bytes were copied.
  */
 static void
 sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
 {
 	boolean_t first = B_TRUE;
 	caddr_t dst = (caddr_t)target;
 	int copied = 0;
 	void *chunk;
 	uint32_t chunklen;
 
 	if (func == NULL) {
 		bcopy(datastart, target, buflen);
 		return;
 	}
 
 	while (copied < buflen) {
 		/* the locator reports where the next chunk is and its size */
 		func(&chunk, &chunklen, buflen, first, datastart);
 		bcopy(chunk, dst, chunklen);
 		dst += chunklen;
 		copied += chunklen;
 		first = B_FALSE;
 	}
 }
 
 /*
  * Determine several different sizes for laying out attr_desc[] in `db':
  *
  *  - the return value is the SA header size, rounded up to 8 bytes
  *  - *total is the number of bytes needed to store all attributes, with
  *    each attribute starting on an 8-byte boundary
  *  - *index is the index in the attribute array at which spill would
  *    occur, or -1 if no such point was found
  *
  * The boolean *will_spill is set when spilling is necessary.  It is only
  * written when buftype is SA_BONUS.
  *
  * When buftype is SA_BONUS and sa_force_spill is set, everything is sent to
  * the spill block: *total and *index are 0, *will_spill is B_TRUE and the
  * returned header size is 0.
  */
 static int
 sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
     boolean_t *will_spill)
 {
 	int var_size = 0;	/* variable-length attributes seen so far */
 	int i;
 	int j = -1;		/* last attr for which hdrsize was grown */
 	int full_space;
 	int hdrsize;
 	boolean_t done = B_FALSE;	/* spill index found; only sum sizes */
 
 	if (buftype == SA_BONUS && sa->sa_force_spill) {
 		*total = 0;
 		*index = 0;
 		*will_spill = B_TRUE;
 		return (0);
 	}
 
 	*index = -1;
 	*total = 0;
 
 	if (buftype == SA_BONUS)
 		*will_spill = B_FALSE;
 
 	/* DMU_OT_ZNODE bonus buffers carry no SA header */
 	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
 	    sizeof (sa_hdr_phys_t);
 
 	full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
 	ASSERT(IS_P2ALIGNED(full_space, 8));
 
 	for (i = 0; i != attr_count; i++) {
 		boolean_t is_var_sz;
 
 		*total = P2ROUNDUP(*total, 8);
 		*total += attr_desc[i].sa_length;
 		if (done)
 			goto next;
 
 		/* a registered length of 0 marks a variable-length attr */
 		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
 		if (is_var_sz) {
 			var_size++;
 		}
 
 		/* only the second and later variable-length attrs grow hdr */
 		if (is_var_sz && var_size > 1) {
 			if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
 			    *total < full_space) {
 				/*
 				 * Account for header space used by array of
 				 * optional sizes of variable-length attributes.
 				 * Record the index in case this increase needs
 				 * to be reversed due to spill-over.
 				 */
 				hdrsize += sizeof (uint16_t);
 				j = i;
 			} else {
 				done = B_TRUE;
 				*index = i;
 				if (buftype == SA_BONUS)
 					*will_spill = B_TRUE;
 				/* note: skips the check at `next' below */
 				continue;
 			}
 		}
 
 		/*
 		 * find index of where spill *could* occur.
 		 * Then continue to count of remainder attribute
 		 * space.  The sum is used later for sizing bonus
 		 * and spill buffer.
 		 */
 		if (buftype == SA_BONUS && *index == -1 &&
 		    (*total + P2ROUNDUP(hdrsize, 8)) >
 		    (full_space - sizeof (blkptr_t))) {
 			*index = i;
 			done = B_TRUE;
 		}
 
 next:
 		/* spill is certain once header + data exceed the buffer */
 		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
 		    buftype == SA_BONUS)
 			*will_spill = B_TRUE;
 	}
 
 	/*
 	 * j holds the index of the last variable-sized attribute for
 	 * which hdrsize was increased.  Reverse the increase if that
 	 * attribute will be relocated to the spill block.
 	 */
 	if (*will_spill && j == *index)
 		hdrsize -= sizeof (uint16_t);
 
 	hdrsize = P2ROUNDUP(hdrsize, 8);
 	return (hdrsize);
 }
 
 /*
  * Bytes needed in a buffer for `total' bytes of attribute data plus a
  * `header'-byte SA header.  Arguments and result are parenthesized so the
  * macro expands safely next to operators of higher precedence.
  */
 #define	BUF_SPACE_NEEDED(total, header) ((total) + (header))
 
 /*
  * Write the attributes in attr_desc[] into the handle's bonus buffer (and
  * spill block when they do not all fit), then find the layout that
  * corresponds to the ordering of attributes in each buffer.
  * If not found a new layout number is created and added to
  * persistent layout tables.
  *
  * Returns 0 on success or EFBIG when the attribute data exceeds
  * SPA_OLD_MAXBLOCKSIZE.  Other failures trip VERIFY().
  */
 static int
 sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	uint64_t hash;
 	sa_buf_type_t buftype;
 	sa_hdr_phys_t *sahdr;
 	void *data_start;
 	int buf_space;
 	sa_attr_type_t *attrs, *attrs_start;
 	int i, lot_count;
 	int hdrsize;
 	int spillhdrsize = 0;
 	int used;
 	dmu_object_type_t bonustype;
 	sa_lot_t *lot;
 	int len_idx;
 	int spill_used;
 	boolean_t spilling;
 
 	dmu_buf_will_dirty(hdl->sa_bonus, tx);
 	bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
 
 	/* first determine bonus header size and sum of all attributes */
 	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
 	    SA_BONUS, &i, &used, &spilling);
 
 	if (used > SPA_OLD_MAXBLOCKSIZE)
 		return (SET_ERROR(EFBIG));
 
 	/* when spilling, leave room for a blkptr_t in the bonus buffer */
 	VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
 	    MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
 	    used + hdrsize, tx));
 
 	ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
 	    bonustype == DMU_OT_SA);
 
 	/* setup and size spill buffer when needed */
 	if (spilling) {
 		boolean_t dummy;
 
 		if (hdl->sa_spill == NULL) {
 			VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
 			    &hdl->sa_spill) == 0);
 		}
 		dmu_buf_will_dirty(hdl->sa_spill, tx);
 
 		/* i is the first attribute that goes to the spill block */
 		spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
 		    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
 		    &spill_used, &dummy);
 
 		if (spill_used > SPA_OLD_MAXBLOCKSIZE)
 			return (SET_ERROR(EFBIG));
 
 		/* this value is overwritten below before it is read */
 		buf_space = hdl->sa_spill->db_size - spillhdrsize;
 		if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
 		    hdl->sa_spill->db_size)
 			VERIFY(0 == sa_resize_spill(hdl,
 			    BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
 	}
 
 	/* setup starting pointers to lay down data */
 	data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
 	sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
 	buftype = SA_BONUS;
 
 	/* with sa_force_spill no attribute data is placed in the bonus */
 	if (spilling)
 		buf_space = (sa->sa_force_spill) ?
 		    0 : SA_BLKPTR_SPACE - hdrsize;
 	else
 		buf_space = hdl->sa_bonus->db_size - hdrsize;
 
 	attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	lot_count = 0;
 
 	/* hash starts at -1ULL and accumulates SA_ATTR_HASH() per attr */
 	for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
 		uint16_t length;
 
 		ASSERT(IS_P2ALIGNED(data_start, 8));
 		ASSERT(IS_P2ALIGNED(buf_space, 8));
 		attrs[i] = attr_desc[i].sa_attr;
 		/* registered length 0 means the caller supplies the length */
 		length = SA_REGISTERED_LEN(sa, attrs[i]);
 		if (length == 0)
 			length = attr_desc[i].sa_length;
 		else
 			VERIFY(length == attr_desc[i].sa_length);
 
 		if (buf_space < length) {  /* switch to spill buffer */
 			VERIFY(spilling);
 			VERIFY(bonustype == DMU_OT_SA);
 			/* finish the bonus layout before leaving it */
 			if (buftype == SA_BONUS && !sa->sa_force_spill) {
 				sa_find_layout(hdl->sa_os, hash, attrs_start,
 				    lot_count, tx, &lot);
 				SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
 			}
 
 			/* restart layout accumulation for the spill block */
 			buftype = SA_SPILL;
 			hash = -1ULL;
 			len_idx = 0;
 
 			sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
 			sahdr->sa_magic = SA_MAGIC;
 			data_start = (void *)((uintptr_t)sahdr +
 			    spillhdrsize);
 			attrs_start = &attrs[i];
 			buf_space = hdl->sa_spill->db_size - spillhdrsize;
 			lot_count = 0;
 		}
 		hash ^= SA_ATTR_HASH(attrs[i]);
 		attr_desc[i].sa_addr = data_start;
 		attr_desc[i].sa_size = length;
 		SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
 		    data_start, length);
 		/* variable-length attrs record their size in the header */
 		if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
 			sahdr->sa_lengths[len_idx++] = length;
 		}
 		VERIFY((uintptr_t)data_start % 8 == 0);
 		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 		    length), 8);
 		buf_space -= P2ROUNDUP(length, 8);
 		lot_count++;
 	}
 
 	/* layout of the last buffer written (bonus or spill) */
 	sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
 
 	/*
 	 * Verify that old znodes always have layout number 0.
 	 * Must be DMU_OT_SA for arbitrary layouts
 	 */
 	VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
 	    (bonustype == DMU_OT_SA && lot->lot_num > 1));
 
 	if (bonustype == DMU_OT_SA) {
 		SA_SET_HDR(sahdr, lot->lot_num,
 		    buftype == SA_BONUS ? hdrsize : spillhdrsize);
 	}
 
 	kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
 	/* the old index tables are stale; release and rebuild them */
 	if (hdl->sa_bonus_tab) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 		hdl->sa_bonus_tab = NULL;
 	}
 	if (!sa->sa_force_spill)
 		VERIFY(0 == sa_build_index(hdl, SA_BONUS));
 	if (hdl->sa_spill) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 		if (!spilling) {
 			/*
 			 * remove spill block that is no longer needed.
 			 */
 			dmu_buf_rele(hdl->sa_spill, NULL);
 			hdl->sa_spill = NULL;
 			hdl->sa_spill_tab = NULL;
 			VERIFY(0 == dmu_rm_spill(hdl->sa_os,
 			    sa_handle_object(hdl), tx));
 		} else {
 			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Free every attribute name in the attribute table, then the table itself,
  * and clear sa_attr_table.  Does nothing when no table is allocated.
  */
 static void
 sa_free_attr_table(sa_os_t *sa)
 {
 	sa_attr_table_t *table = sa->sa_attr_table;
 	int n;
 
 	if (table == NULL)
 		return;
 
 	/* names were allocated with strlen + 1 bytes */
 	n = 0;
 	while (n != sa->sa_num_attrs) {
 		if (table[n].sa_name) {
 			kmem_free(table[n].sa_name,
 			    strlen(table[n].sa_name) + 1);
 		}
 		n++;
 	}
 
 	kmem_free(table, sizeof (sa_attr_table_t) * sa->sa_num_attrs);
 	sa->sa_attr_table = NULL;
 }
 
 /*
  * Build the per-objset attribute tables:
  *
  *  - sa_user_table maps each of the caller's `count' reg_attrs entries to
  *    an attribute number
  *  - sa_attr_table describes every known attribute (length, byteswap
  *    function index, name, registered flag)
  *
  * Returns 0 on success.  On failure both tables are freed and an error is
  * returned (EINVAL when no more specific error is available).
  */
 static int
 sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 {
 	sa_os_t *sa = os->os_sa;
 	uint64_t sa_attr_count = 0;
 	uint64_t sa_reg_count = 0;
 	int error = 0;
 	uint64_t attr_value;
 	sa_attr_table_t *tb;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int registered_count = 0;
 	int i;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 
 	sa->sa_user_table =
 	    kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
 	sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
 
 	/* start from the number of attributes already in the registry */
 	if (sa->sa_reg_attr_obj != 0) {
 		error = zap_count(os, sa->sa_reg_attr_obj,
 		    &sa_attr_count);
 
 		/*
 		 * Make sure we retrieved a count and that it isn't zero
 		 */
 		if (error || (error == 0 && sa_attr_count == 0)) {
 			if (error == 0)
 				error = SET_ERROR(EINVAL);
 			goto bail;
 		}
 		sa_reg_count = sa_attr_count;
 	}
 
 	/* with nothing registered, ZFS objsets begin with the legacy set */
 	if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
 		sa_attr_count += sa_legacy_attr_count;
 
 	/* Allocate attribute numbers for attributes that aren't registered */
 	for (i = 0; i != count; i++) {
 		boolean_t found = B_FALSE;
 		int j;
 
 		/* legacy attributes keep their fixed numbers */
 		if (ostype == DMU_OST_ZFS) {
 			for (j = 0; j != sa_legacy_attr_count; j++) {
 				if (strcmp(reg_attrs[i].sa_name,
 				    sa_legacy_attrs[j].sa_name) == 0) {
 					sa->sa_user_table[i] =
 					    sa_legacy_attrs[j].sa_attr;
 					found = B_TRUE;
 				}
 			}
 		}
 		if (found)
 			continue;
 
 		if (sa->sa_reg_attr_obj)
 			error = zap_lookup(os, sa->sa_reg_attr_obj,
 			    reg_attrs[i].sa_name, 8, 1, &attr_value);
 		else
 			error = SET_ERROR(ENOENT);
 		switch (error) {
 		case ENOENT:
 			/* not registered yet: hand out the next number */
 			sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
 			sa_attr_count++;
 			break;
 		case 0:
 			sa->sa_user_table[i] = ATTR_NUM(attr_value);
 			break;
 		default:
 			goto bail;
 		}
 	}
 
 	sa->sa_num_attrs = sa_attr_count;
 	tb = sa->sa_attr_table =
 	    kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
 
 	/*
 	 * Attribute table is constructed from requested attribute list,
 	 * previously foreign registered attributes, and also the legacy
 	 * ZPL set of attributes.
 	 */
 
 	if (sa->sa_reg_attr_obj) {
 		for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t value;
 			value  = za.za_first_integer;
 
 			/* the registry value encodes number/length/bswap */
 			registered_count++;
 			tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
 			tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
 			tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
 			tb[ATTR_NUM(value)].sa_registered = B_TRUE;
 
 			if (tb[ATTR_NUM(value)].sa_name) {
 				continue;
 			}
 			tb[ATTR_NUM(value)].sa_name =
 			    kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
 			(void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
 			    strlen(za.za_name) +1);
 		}
 		zap_cursor_fini(&zc);
 		/*
 		 * Make sure we processed the correct number of registered
 		 * attributes
 		 */
 		if (registered_count != sa_reg_count) {
 			ASSERT(error != 0);
 			goto bail;
 		}
 
 	}
 
 	/* fill in legacy attributes the registry did not already name */
 	if (ostype == DMU_OST_ZFS) {
 		for (i = 0; i != sa_legacy_attr_count; i++) {
 			if (tb[i].sa_name)
 				continue;
 			tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
 			tb[i].sa_length = sa_legacy_attrs[i].sa_length;
 			tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
 			tb[i].sa_registered = B_FALSE;
 			tb[i].sa_name =
 			    kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
 			    KM_SLEEP);
 			(void) strlcpy(tb[i].sa_name,
 			    sa_legacy_attrs[i].sa_name,
 			    strlen(sa_legacy_attrs[i].sa_name) + 1);
 		}
 	}
 
 	/* finally, the caller's attributes that are still unnamed */
 	for (i = 0; i != count; i++) {
 		sa_attr_type_t attr_id;
 
 		attr_id = sa->sa_user_table[i];
 		if (tb[attr_id].sa_name)
 			continue;
 
 		tb[attr_id].sa_length = reg_attrs[i].sa_length;
 		tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
 		tb[attr_id].sa_attr = attr_id;
 		tb[attr_id].sa_name =
 		    kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
 		(void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
 		    strlen(reg_attrs[i].sa_name) + 1);
 	}
 
 	/* any attribute not yet in the registry must be registered later */
 	sa->sa_need_attr_registration =
 	    (sa_attr_count != registered_count);
 
 	return (0);
 bail:
 	kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
 	sa->sa_user_table = NULL;
 	sa_free_attr_table(sa);
 	return ((error != 0) ? error : EINVAL);
 }
 
 /*
  * Set up the SA state (sa_os_t) for an objset and return the user
  * attribute table through `user_table'.
  *
  * If the objset already has SA state, its existing user table is returned.
  * Otherwise a new sa_os_t is created, the attribute tables are built via
  * sa_attr_table_setup(), and the layouts stored under sa_obj (if any) are
  * loaded into the layout AVL trees.
  *
  * Returns 0 on success.  On failure all state allocated here is freed,
  * os->os_sa is reset to NULL and an error is returned (ECKSUM is reported
  * as EIO).
  */
 int
 sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
     sa_attr_type_t **user_table)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	sa_os_t *sa;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 	sa_attr_type_t *tb;
 	int error;
 
 	mutex_enter(&os->os_user_ptr_lock);
 	/* already set up: hand back the existing table */
 	if (os->os_sa) {
 		mutex_enter(&os->os_sa->sa_lock);
 		mutex_exit(&os->os_user_ptr_lock);
 		tb = os->os_sa->sa_user_table;
 		mutex_exit(&os->os_sa->sa_lock);
 		*user_table = tb;
 		return (0);
 	}
 
 	sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
 	mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	sa->sa_master_obj = sa_obj;
 
 	/* publish the new state, taking sa_lock before dropping the other */
 	os->os_sa = sa;
 	mutex_enter(&sa->sa_lock);
 	mutex_exit(&os->os_user_ptr_lock);
 	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
 	avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
 
 	/* a missing layouts or registry entry (ENOENT) is not an error */
 	if (sa_obj) {
 		error = zap_lookup(os, sa_obj, SA_LAYOUTS,
 		    8, 1, &sa->sa_layout_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 		error = zap_lookup(os, sa_obj, SA_REGISTRY,
 		    8, 1, &sa->sa_reg_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 	}
 
 	if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
 		goto fail;
 
 	if (sa->sa_layout_attr_obj != 0) {
 		uint64_t layout_count;
 
 		error = zap_count(os, sa->sa_layout_attr_obj,
 		    &layout_count);
 
 		/*
 		 * Layout number count should be > 0
 		 */
 		if (error || (error == 0 && layout_count == 0)) {
 			if (error == 0)
 				error = SET_ERROR(EINVAL);
 			goto fail;
 		}
 
 		/* load every persisted layout into the AVL trees */
 		for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			sa_attr_type_t *lot_attrs;
 			uint64_t lot_num;
 
 			lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
 			    za.za_num_integers, KM_SLEEP);
 
 			if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
 			    za.za_name, 2, za.za_num_integers,
 			    lot_attrs))) != 0) {
 				kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 				    za.za_num_integers);
 				break;
 			}
 			/* the entry name is the decimal layout number */
 			VERIFY(ddi_strtoull(za.za_name, NULL, 10,
 			    (unsigned long long *)&lot_num) == 0);
 
 			/* the entry keeps its own copy of lot_attrs */
 			(void) sa_add_layout_entry(os, lot_attrs,
 			    za.za_num_integers, lot_num,
 			    sa_layout_info_hash(lot_attrs,
 			    za.za_num_integers), B_FALSE, NULL);
 			kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 			    za.za_num_integers);
 		}
 		zap_cursor_fini(&zc);
 
 		/*
 		 * Make sure layout count matches number of entries added
 		 * to AVL tree
 		 */
 		if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
 			ASSERT(error != 0);
 			goto fail;
 		}
 	}
 
 	/* Add special layout number for old ZNODES */
 	if (ostype == DMU_OST_ZFS) {
 		(void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
 		    sa_legacy_attr_count, 0,
 		    sa_layout_info_hash(sa_legacy_zpl_layout,
 		    sa_legacy_attr_count), B_FALSE, NULL);
 
 		/* layout 1 is the empty ("dummy") layout */
 		(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
 		    0, B_FALSE, NULL);
 	}
 	*user_table = os->os_sa->sa_user_table;
 	mutex_exit(&sa->sa_lock);
 	return (0);
 fail:
 	/* undo everything allocated above */
 	os->os_sa = NULL;
 	sa_free_attr_table(sa);
 	if (sa->sa_user_table)
 		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 	mutex_exit(&sa->sa_lock);
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
 	mutex_destroy(&sa->sa_lock);
 	kmem_free(sa, sizeof (sa_os_t));
 	return ((error == ECKSUM) ? EIO : error);
 }
 
 /*
  * Free all SA state attached to an objset: the user table, the attribute
  * table, every layout with its index tables, both AVL trees, the lock and
  * finally the sa_os_t itself.  os->os_sa is NULL on return.
  */
 void
 sa_tear_down(objset_t *os)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *layout;
 	void *cookie = NULL;
 
 	kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 
 	/* Free up attr table */
 	sa_free_attr_table(sa);
 
 	/* release the index tables still linked on each layout */
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree,
 	    &cookie)) != NULL) {
 		sa_idx_tab_t *tab;
 
 		while ((tab = list_head(&layout->lot_idx_tab)) != NULL) {
 			ASSERT(refcount_count(&tab->sa_refcount));
 			sa_idx_tab_rele(os, tab);
 		}
 	}
 
 	/* the layouts themselves are freed from the number tree */
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree,
 	    &cookie)) != NULL) {
 		kmem_free(layout->lot_attrs,
 		    sizeof (sa_attr_type_t) * layout->lot_attr_count);
 		kmem_free(layout, sizeof (sa_lot_t));
 	}
 
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
 	mutex_destroy(&sa->sa_lock);
 
 	kmem_free(sa, sizeof (sa_os_t));
 	os->os_sa = NULL;
 }
 
 /*
  * Attribute-iterator callback: record in the index table (userp) the byte
  * offset of `attr' from the header and, for variable-length attributes,
  * its length.
  */
 void
 sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t var_length, void *userp)
 {
 	sa_idx_tab_t *idx_tab = userp;
 	uint32_t attr_off;
 
 	attr_off = (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr);
 
 	if (var_length) {
 		ASSERT(idx_tab->sa_variable_lengths);
 		idx_tab->sa_variable_lengths[length_idx] = length;
 	}
 
 	TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, attr_off);
 }
 
 /*
  * Walk the attributes stored behind `hdr' in layout order and call `func'
  * once per attribute with its address, attribute number and length.
  *
  * `tab' is the layout to use; when it is NULL the layout is looked up by
  * the layout number found in the header.  `userp' is passed through to
  * `func' unchanged.
  */
 static void
 sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
     sa_iterfunc_t func, sa_lot_t *tab, void *userp)
 {
 	void *data_start;
 	sa_lot_t *tb = tab;
 	sa_lot_t search;
 	avl_index_t loc;
 	sa_os_t *sa = os->os_sa;
 	int i;
 	uint16_t *length_start = NULL;
 	uint8_t length_idx = 0;	/* next slot in the header's length array */
 
 	if (tab == NULL) {
 		search.lot_num = SA_LAYOUT_NUM(hdr, type);
 		tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
 		ASSERT(tb);
 	}
 
 	/*
 	 * For SA bonus types the data follows the header and its array of
 	 * variable lengths, rounded up to 8 bytes; otherwise the data starts
 	 * at hdr itself.
 	 */
 	if (IS_SA_BONUSTYPE(type)) {
 		data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
 		    offsetof(sa_hdr_phys_t, sa_lengths) +
 		    (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
 		length_start = hdr->sa_lengths;
 	} else {
 		data_start = hdr;
 	}
 
 	for (i = 0; i != tb->lot_attr_count; i++) {
 		int attr_length, reg_length;
 		uint8_t idx_len;
 
 		/*
 		 * A registered length of 0 marks a variable-length attribute
 		 * whose size comes from the header's length array.
 		 * NOTE(review): length_start stays NULL for non-SA types, so
 		 * this presumably relies on such layouts having no
 		 * variable-length attributes -- confirm.
 		 */
 		reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
 		if (reg_length) {
 			attr_length = reg_length;
 			idx_len = 0;
 		} else {
 			attr_length = length_start[length_idx];
 			idx_len = length_idx++;
 		}
 
 		func(hdr, data_start, tb->lot_attrs[i], attr_length,
 		    idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
 
 		/* each attribute starts on an 8-byte boundary */
 		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 		    attr_length), 8);
 	}
 }
 
 /*
  * Attribute-iterator callback: byteswap one attribute in place using the
  * byteswap function selected by the attribute's table entry.
  */
 /*ARGSUSED*/
 void
 sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t variable_length, void *userp)
 {
 	sa_handle_t *hdl = userp;
 	sa_attr_table_t *entry;
 
 	entry = &hdl->sa_os->os_sa->sa_attr_table[attr];
 	sa_bswap_table[entry->sa_byteswap](attr_addr, length);
 }
 
 /*
  * Byteswap the SA header and every attribute of the handle's bonus or spill
  * buffer (selected by buftype) in place.  Returns immediately when the
  * header magic already equals SA_MAGIC.  The caller must hold the
  * per-objset sa_lock.
  */
 void
 sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
 	dmu_buf_impl_t *db;
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	int num_lengths = 1;
 	int i;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	if (sa_hdr_phys->sa_magic == SA_MAGIC)
 		return;
 
 	db = SA_GET_DB(hdl, buftype);
 
 	/* the spill buffer is released and thawed before it is modified */
 	if (buftype == SA_SPILL) {
 		arc_release(db->db_buf, NULL);
 		arc_buf_thaw(db->db_buf);
 	}
 
 	sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
 	sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
 
 	/*
 	 * Determine number of variable lengths in header
 	 * The standard 8 byte header has one for free and a
 	 * 16 byte header would have 4 + 1;
 	 */
 	if (SA_HDR_SIZE(sa_hdr_phys) > 8)
 		num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
 	for (i = 0; i != num_lengths; i++)
 		sa_hdr_phys->sa_lengths[i] =
 		    BSWAP_16(sa_hdr_phys->sa_lengths[i]);
 
 	/* now swap each attribute's payload */
 	sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
 	    sa_byteswap_cb, NULL, hdl);
 
 	if (buftype == SA_SPILL)
 		arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
 }
 
 /*
  * Attach an index table to the handle for its bonus or spill buffer
  * (selected by buftype), byteswapping the buffer first when its header
  * magic is stored in the opposite byte order.  Always returns 0.
  */
 static int
 sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	sa_hdr_phys_t *sa_hdr_phys;
 	dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
 	dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_idx_tab_t *idx_tab;
 
 	sa_hdr_phys = SA_GET_HDR(hdl, buftype);
 
 	mutex_enter(&sa->sa_lock);
 
 	/* Do we need to byteswap? */
 
 	/* only check if not old znode */
 	/* a magic of 0 is left alone (no byteswap attempted) */
 	if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
 	    sa_hdr_phys->sa_magic != 0) {
 		VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
 		sa_byteswap(hdl, buftype);
 	}
 
 	/* returns a held index table, reusing a matching one if possible */
 	idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
 
 	if (buftype == SA_BONUS)
 		hdl->sa_bonus_tab = idx_tab;
 	else
 		hdl->sa_spill_tab = idx_tab;
 
 	mutex_exit(&sa->sa_lock);
 	return (0);
 }
 
 /*
  * Eviction callback registered with the bonus dbuf by
  * sa_handle_get_from_db().  It is not expected to run: it panics with the
  * address of the dbuf being evicted.
  */
 /*ARGSUSED*/
 void
 sa_evict(dmu_buf_t *db, void *sap)
 {
 	panic("evicting sa dbuf %p\n", (void *)db);
 }
 
 /*
  * Drop one hold on an index table (arg may be NULL, in which case nothing
  * happens).  When the last hold goes away the table is unlinked from its
  * layout and freed.  Takes and releases the per-objset sa_lock.
  */
 static void
 sa_idx_tab_rele(objset_t *os, void *arg)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_idx_tab_t *idx_tab = arg;
 
 	if (idx_tab == NULL)
 		return;
 
 	mutex_enter(&sa->sa_lock);
 
 	/* other holds remain: nothing to free */
 	if (refcount_remove(&idx_tab->sa_refcount, NULL) != 0) {
 		mutex_exit(&sa->sa_lock);
 		return;
 	}
 
 	list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
 	if (idx_tab->sa_variable_lengths) {
 		kmem_free(idx_tab->sa_variable_lengths,
 		    sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes);
 	}
 	refcount_destroy(&idx_tab->sa_refcount);
 	kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs);
 	kmem_free(idx_tab, sizeof (sa_idx_tab_t));
 
 	mutex_exit(&sa->sa_lock);
 }
 
 /*
  * Take one additional hold on an index table.  The caller must already
  * hold the per-objset sa_lock.
  */
 static void
 sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
 {
 	sa_os_t *sa = os->os_sa;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	(void) refcount_add(&idx_tab->sa_refcount, NULL);
 }
 
 /*
  * Destroy an SA handle: detach it from the bonus dbuf's user data, release
  * its bonus and spill index tables, drop the holds on the bonus and spill
  * buffers, and return the handle to sa_cache.
  */
 void
 sa_handle_destroy(sa_handle_t *hdl)
 {
 	mutex_enter(&hdl->sa_lock);
 	/* replace this handle with NULL as the dbuf's user data */
 	(void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
-	    NULL, NULL, NULL);
+	    NULL, NULL);
 
 	if (hdl->sa_bonus_tab) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 		hdl->sa_bonus_tab = NULL;
 	}
 	if (hdl->sa_spill_tab) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 		hdl->sa_spill_tab = NULL;
 	}
 
 	dmu_buf_rele(hdl->sa_bonus, NULL);
 
 	if (hdl->sa_spill)
 		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
 	mutex_exit(&hdl->sa_lock);
 
 	kmem_cache_free(sa_cache, hdl);
 }
 
 /*
  * Return, through handlepp, an SA handle for the bonus dbuf `db'.
  *
  * For SA_HDL_SHARED an existing handle stored as the dbuf's user data is
  * reused; otherwise (or when none exists) a new handle is allocated from
  * sa_cache and its bonus index is built.  Returns the result of
  * sa_build_index() for a new handle, or 0 when an existing one is reused.
  */
 int
 sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	int error = 0;
 	dmu_object_info_t doi;
 	sa_handle_t *handle;
 
 #ifdef ZFS_DEBUG
 	dmu_object_info_from_db(db, &doi);
 	ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
 	    doi.doi_bonus_type == DMU_OT_ZNODE);
 #endif
 	/* find handle, if it exists */
 	/* if one doesn't exist then create a new one, and initialize it */
 
 	handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
 	if (handle == NULL) {
 		sa_handle_t *newhandle;
 		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
 		handle->sa_userp = userp;
 		handle->sa_bonus = db;
 		handle->sa_os = os;
 		handle->sa_spill = NULL;
 
 		error = sa_build_index(handle, SA_BONUS);
 		newhandle = (hdl_type == SA_HDL_SHARED) ?
-		    dmu_buf_set_user_ie(db, handle,
-		    NULL, sa_evict) : NULL;
+		    dmu_buf_set_user_ie(db, handle, sa_evict) : NULL;
 
 		/*
 		 * A non-NULL result means a handle was already attached to
 		 * the dbuf; use that one and discard ours.
 		 * NOTE(review): the index table held by sa_build_index()
 		 * above does not appear to be released on this path --
 		 * confirm.
 		 */
 		if (newhandle != NULL) {
 			kmem_cache_free(sa_cache, handle);
 			handle = newhandle;
 		}
 	}
 	*handlepp = handle;
 
 	return (error);
 }
 
 /*
  * Hold the bonus buffer of object `objid' and return an SA handle for it
  * through handlepp.  Returns the error from dmu_bonus_hold() if the hold
  * fails, otherwise the result of sa_handle_get_from_db().
  */
 int
 sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	dmu_buf_t *db;
 	int error;
 
 	error = dmu_bonus_hold(objset, objid, NULL, &db);
 	if (error != 0)
 		return (error);
 
 	error = sa_handle_get_from_db(objset, db, userp, hdl_type, handlepp);
 	return (error);
 }
 
 /*
  * Hold the bonus buffer of object obj_num on behalf of `tag'; the buffer is
  * returned through `db'.  Thin wrapper around dmu_bonus_hold(), whose
  * return value is passed through.
  */
 int
 sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
 {
 	return (dmu_bonus_hold(objset, obj_num, tag, db));
 }
 
 /*
  * Release a buffer hold taken on behalf of `tag'.  Thin wrapper around
  * dmu_buf_rele().
  */
 void
 sa_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_rele(db, tag);
 }
 
 /*
  * Look up the `count' attributes described by `bulk'.  The caller must
  * already hold hdl->sa_lock.  Returns the result of sa_attr_op() with
  * SA_LOOKUP and no transaction.
  */
 int
 sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
 {
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 	return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
 }
 
 /*
  * Look up a single attribute and copy its value into `buf'.  Takes
  * hdl->sa_lock around the lookup and returns the result of
  * sa_lookup_impl().
  */
 int
 sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
 {
 	sa_bulk_attr_t req;
 	int rc;
 
 	/* a one-entry bulk request without a data locator */
 	req.sa_data_func = NULL;
 	req.sa_length = buflen;
 	req.sa_data = buf;
 	req.sa_attr = attr;
 
 	ASSERT(hdl);
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_lookup_impl(hdl, &req, 1);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 #ifdef _KERNEL
 /*
  * Look up a single attribute and move its value into `uio' (kernel only).
  * At most MIN(attribute size, uio->uio_resid) bytes are moved.  Returns the
  * lookup error, or else the result of uiomove().
  */
 int
 sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
 {
 	int error;
 	sa_bulk_attr_t bulk;
 
 	/* sa_data is NULL so sa_attr_op() only locates the attribute */
 	bulk.sa_data = NULL;
 	bulk.sa_attr = attr;
 	bulk.sa_data_func = NULL;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
 		error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
 		    uio->uio_resid), UIO_READ, uio);
 	}
 	mutex_exit(&hdl->sa_lock);
 	return (error);
 
 }
 #endif
 
 /*
  * Return a held index table for the SA header at `data'.
  *
  * The layout is looked up by the layout number in the header.  An index
  * table already linked on that layout is reused when its variable lengths
  * match the header's; otherwise a new one is built with sa_attr_iter() and
  * appended to the layout's list.  A new table gets two holds: one for the
  * consumer and one for the layout.
  *
  * sa_idx_tab_hold() asserts that the per-objset sa_lock is held, so the
  * caller must hold it.
  */
 void *
 sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
 {
 	sa_idx_tab_t *idx_tab;
 	sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, search;
 	avl_index_t loc;
 
 	/*
 	 * Determine layout number.  If SA node and header == 0 then
 	 * force the index table to the dummy "1" empty layout.
 	 *
 	 * The layout number would only be zero for a newly created file
 	 * that has not added any attributes yet, or with crypto enabled which
 	 * doesn't write any attributes to the bonus buffer.
 	 */
 
 	search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
 
 	tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
 
 	/* Verify header size is consistent with layout information */
 	ASSERT(tb);
 	ASSERT(IS_SA_BONUSTYPE(bonustype) &&
 	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
 	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
 
 	/*
 	 * See if any of the already existing TOC entries can be reused?
 	 */
 
 	for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
 	    idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
 		boolean_t valid_idx = B_TRUE;
 		int i;
 
 		/* reusable only if every variable length matches */
 		if (tb->lot_var_sizes != 0 &&
 		    idx_tab->sa_variable_lengths != NULL) {
 			for (i = 0; i != tb->lot_var_sizes; i++) {
 				if (hdr->sa_lengths[i] !=
 				    idx_tab->sa_variable_lengths[i]) {
 					valid_idx = B_FALSE;
 					break;
 				}
 			}
 		}
 		if (valid_idx) {
 			sa_idx_tab_hold(os, idx_tab);
 			return (idx_tab);
 		}
 	}
 
 	/* No such luck, create a new entry */
 	idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
 	idx_tab->sa_idx_tab =
 	    kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
 	idx_tab->sa_layout = tb;
 	refcount_create(&idx_tab->sa_refcount);
 	if (tb->lot_var_sizes)
 		idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
 		    tb->lot_var_sizes, KM_SLEEP);
 
 	/* fill in offsets and variable lengths from the header */
 	sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
 	    tb, idx_tab);
 	sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
 	sa_idx_tab_hold(os, idx_tab);	/* one for layout */
 	list_insert_tail(&tb->lot_idx_tab, idx_tab);
 	return (idx_tab);
 }
 
 /*
  * Default data locator: reports the whole of `userdata' as a single chunk
  * of total_len bytes.  It asserts `start', i.e. it expects to be called
  * only for the first chunk.
  */
 void
 sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
     boolean_t start, void *userdata)
 {
 	ASSERT(start);
 
 	*dataptr = userdata;
 	*len = total_len;
 }
 
 /*
  * Write every not-yet-registered attribute of the attribute table into the
  * registry ZAP object, creating that object first if necessary.  Does
  * nothing when no registration is pending or there is no master object.
  */
 static void
 sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
 {
 	uint64_t attr_value = 0;
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_attr_table_t *tb = sa->sa_attr_table;
 	int i;
 
 	mutex_enter(&sa->sa_lock);
 
 	if (sa->sa_need_attr_registration && sa->sa_master_obj != 0) {
 		/* create the registry object on first use */
 		if (sa->sa_reg_attr_obj == 0) {
 			sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
 			    DMU_OT_SA_ATTR_REGISTRATION,
 			    sa->sa_master_obj, SA_REGISTRY, tx);
 		}
 
 		for (i = 0; i != sa->sa_num_attrs; i++) {
 			if (!sa->sa_attr_table[i].sa_registered) {
 				/* pack number, length and bswap index */
 				ATTR_ENCODE(attr_value, tb[i].sa_attr,
 				    tb[i].sa_length, tb[i].sa_byteswap);
 				VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
 				    tb[i].sa_name, 8, 1, &attr_value, tx));
 				tb[i].sa_registered = B_TRUE;
 			}
 		}
 		sa->sa_need_attr_registration = B_FALSE;
 	}
 
 	mutex_exit(&sa->sa_lock);
 }
 
 /*
  * Replace all attributes with attributes specified in template.
  * If the dnode had a spill buffer then those attributes will
  * also be replaced, possibly with just an empty spill block.
  *
  * This interface is intended to only be used for bulk adding of
  * attributes for a new file.  It will also be used by the ZPL
  * when converting an old formatted znode to native SA support.
  *
  * Pending attribute registration is synced first.  Returns the result of
  * sa_build_layouts().  This variant takes no handle lock itself;
  * sa_replace_all_by_template() calls it with hdl->sa_lock held.
  */
 int
 sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 
 	if (sa->sa_need_attr_registration)
 		sa_attr_register_sync(hdl, tx);
 	return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
 }
 
 /*
  * Locking wrapper: takes hdl->sa_lock around
  * sa_replace_all_by_template_locked() and returns its result.
  */
 int
 sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	int err;
 
 	mutex_enter(&hdl->sa_lock);
 	err = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count,
 	    tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Add/remove a single attribute or replace a variable-sized attribute value
  * with a value of a different size, and then rewrite the entire set
  * of attributes.
  * Same-length attribute value replacement (including fixed-length attributes)
  * is handled more efficiently by the upper layers.
  *
  * The caller must hold hdl->sa_lock (asserted below).  newattr is the
  * attribute being added, removed or replaced according to action; locator,
  * datastart and buflen describe its new value for SA_ADD/SA_REPLACE.
  *
  * Returns the error from sa_get_spill() when it fails with anything other
  * than ENOENT, otherwise the result of sa_build_layouts().
  */
 static int
 sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 	dnode_t *dn;
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];	/* [0] = bonus copy, [1] = spill copy */
 	int bonus_attr_count = 0;
 	int bonus_data_size = 0;
 	int spill_data_size = 0;
 	int spill_attr_count = 0;
 	int error;
 	uint16_t length;
 	int i, j, k, length_idx;
 	sa_hdr_phys_t *hdr;
 	sa_idx_tab_t *idx_tab;
 	int attr_count;
 	int count;
 
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	/* First make a copy of the old data */
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn->dn_bonuslen != 0) {
 		bonus_data_size = hdl->sa_bonus->db_size;
 		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
 		bcopy(hdl->sa_bonus->db_data, old_data[0],
 		    hdl->sa_bonus->db_size);
 		bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
 	} else {
 		old_data[0] = NULL;
 	}
 	DB_DNODE_EXIT(db);
 
 	/* Bring spill buffer online if it isn't currently */
 
 	if ((error = sa_get_spill(hdl)) == 0) {
 		spill_data_size = hdl->sa_spill->db_size;
 		old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
 		bcopy(hdl->sa_spill->db_data, old_data[1],
 		    hdl->sa_spill->db_size);
 		spill_attr_count =
 		    hdl->sa_spill_tab->sa_layout->lot_attr_count;
 	} else if (error && error != ENOENT) {
 		/* Real failure: release the bonus copy and give up. */
 		if (old_data[0])
 			kmem_free(old_data[0], bonus_data_size);
 		return (error);
 	} else {
 		/* ENOENT: continue with no spill copy. */
 		old_data[1] = NULL;
 	}
 
 	/* build descriptor of all attributes */
 
 	attr_count = bonus_attr_count + spill_attr_count;
 	if (action == SA_ADD)
 		attr_count++;
 	else if (action == SA_REMOVE)
 		attr_count--;
 
 	attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
 
 	/*
 	 * loop through bonus and spill buffer if it exists, and
 	 * build up new attr_descriptor to reset the attributes
 	 */
 	k = j = 0;
 	count = bonus_attr_count;
 	hdr = SA_GET_HDR(hdl, SA_BONUS);
 	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
 	for (; k != 2; k++) {
 		/* iterate over each attribute in layout */
 		for (i = 0, length_idx = 0; i != count; i++) {
 			sa_attr_type_t attr;
 
 			attr = idx_tab->sa_layout->lot_attrs[i];
 			if (attr == newattr) {
 				/* duplicate attributes are not allowed */
 				ASSERT(action == SA_REPLACE ||
 				    action == SA_REMOVE);
 				/* must be variable-sized to be replaced here */
 				if (action == SA_REPLACE) {
 					ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
 					SA_ADD_BULK_ATTR(attr_desc, j, attr,
 					    locator, datastart, buflen);
 				}
 			} else {
 				/*
 				 * Carry the existing attribute over: a
 				 * registered length of 0 means the size is
 				 * taken from the header's sa_lengths[], and
 				 * the data pointer refers into the copy made
 				 * above rather than the live buffer.
 				 */
 				length = SA_REGISTERED_LEN(sa, attr);
 				if (length == 0) {
 					length = hdr->sa_lengths[length_idx];
 				}
 
 				SA_ADD_BULK_ATTR(attr_desc, j, attr,
 				    NULL, (void *)
 				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
 				    (uintptr_t)old_data[k]), length);
 			}
 			/*
 			 * Advance through sa_lengths[] for every attribute
 			 * with registered length 0, including newattr.
 			 */
 			if (SA_REGISTERED_LEN(sa, attr) == 0)
 				length_idx++;
 		}
 		/* After the bonus pass, switch to the spill buffer if any. */
 		if (k == 0 && hdl->sa_spill) {
 			hdr = SA_GET_HDR(hdl, SA_SPILL);
 			idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
 			count = spill_attr_count;
 		} else {
 			break;
 		}
 	}
 	if (action == SA_ADD) {
 		/*
 		 * NOTE(review): length is computed here but the descriptor
 		 * below is built with buflen; confirm that is intended.
 		 */
 		length = SA_REGISTERED_LEN(sa, newattr);
 		if (length == 0) {
 			length = buflen;
 		}
 		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
 		    datastart, buflen);
 	}
 	ASSERT3U(j, ==, attr_count);
 
 	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
 
 	/* Release the copies and the descriptor array on every outcome. */
 	if (old_data[0])
 		kmem_free(old_data[0], bonus_data_size);
 	if (old_data[1])
 		kmem_free(old_data[1], spill_data_size);
 	kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
 
 	return (error);
 }
 
 /*
  * Common body of the update entry points.  The caller holds hdl->sa_lock.
  * Syncs out pending attribute registrations, applies the count updates
  * described by bulk, and on success invokes the objset's update callback
  * when one is registered and the bonus type is not an SA bonus type.
  */
 static int
 sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_object_type_t bonustype;
 	int err;
 
 	bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	/* sync out registration table if necessary */
 	if (sa->sa_need_attr_registration)
 		sa_attr_register_sync(hdl, tx);
 
 	err = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
 	if (err != 0)
 		return (err);
 
 	if (!IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
 		sa->sa_update_cb(hdl, tx);
 
 	return (0);
 }
 
 /*
  * Update, or add, a single attribute of the given type from a flat
  * buffer of buflen bytes.  Takes hdl->sa_lock for the duration.
  */
 int
 sa_update(sa_handle_t *hdl, sa_attr_type_t type,
     void *buf, uint32_t buflen, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t desc;
 	int err;
 
 	/* One-entry descriptor; no locator, the data comes from buf. */
 	desc.sa_data_func = NULL;
 	desc.sa_attr = type;
 	desc.sa_data = buf;
 	desc.sa_length = buflen;
 
 	mutex_enter(&hdl->sa_lock);
 	err = sa_bulk_update_impl(hdl, &desc, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Like sa_update(), but the attribute data is supplied through a locator
  * callback; userdata is stored as the descriptor's data pointer.
  */
 int
 sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
     uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t desc;
 	int err;
 
 	desc.sa_data_func = locator;
 	desc.sa_attr = attr;
 	desc.sa_length = buflen;
 	desc.sa_data = userdata;
 
 	mutex_enter(&hdl->sa_lock);
 	err = sa_bulk_update_impl(hdl, &desc, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Return the size of an attribute.
  *
  * Looks attr up under hdl->sa_lock; on success stores the size reported
  * by the lookup in *size and returns 0, otherwise returns the lookup
  * error and leaves *size untouched.
  */
 
 int
 sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
 {
 	sa_bulk_attr_t desc;
 	int err;
 
 	desc.sa_data_func = NULL;
 	desc.sa_data = NULL;
 	desc.sa_attr = attr;
 
 	ASSERT(hdl);
 	mutex_enter(&hdl->sa_lock);
 	err = sa_attr_op(hdl, &desc, 1, SA_LOOKUP, NULL);
 	if (err == 0)
 		*size = desc.sa_size;
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Look up count attributes described by attrs.  The caller must already
  * hold hdl->sa_lock.
  */
 int
 sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int err;
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	err = sa_lookup_impl(hdl, attrs, count);
 	return (err);
 }
 
 /*
  * Locking wrapper around sa_bulk_lookup_locked(): takes and drops
  * hdl->sa_lock around the lookup and returns its result.
  */
 int
 sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int err;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	err = sa_bulk_lookup_locked(hdl, attrs, count);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Apply count attribute updates described by attrs in transaction tx,
  * holding hdl->sa_lock for the duration.
  */
 int
 sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
 {
 	int err;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	err = sa_bulk_update_impl(hdl, attrs, count, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Remove attribute attr from the handle by rewriting the attribute set
  * without it.  Takes hdl->sa_lock for the duration.
  */
 int
 sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
 {
 	int err;
 
 	mutex_enter(&hdl->sa_lock);
 	/* A removal carries no locator, data or length. */
 	err = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (err);
 }
 
 /*
  * Fill in *doi from the handle's bonus buffer via
  * dmu_object_info_from_db().
  */
 void
 sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_info_from_db(bonus, doi);
 }
 
 /*
  * Report the object's block size and block count, as returned by
  * dmu_object_size_from_db() for the handle's bonus buffer.
  */
 void
 sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_size_from_db(bonus, blksize, nblocks);
 }
 
 /*
  * Ask the DMU to swap the user of newhdl's bonus buffer from oldhdl to
  * newhdl (with sa_evict as the eviction callback), then clear oldhdl's
  * bonus pointer.  The return value of dmu_buf_update_user() is
  * deliberately discarded.
  */
 void
 sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
 {
 	(void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
-	    oldhdl, newhdl, NULL, sa_evict);
+	    oldhdl, newhdl, sa_evict);
 	oldhdl->sa_bonus = NULL;
 }
 
 /*
  * Attach an opaque consumer pointer to the handle; it is returned
  * unchanged by sa_get_userdata().
  */
 void
 sa_set_userp(sa_handle_t *hdl, void *ptr)
 {
 	void *userp = ptr;
 
 	hdl->sa_userp = userp;
 }
 
 /*
  * Return the handle's bonus buffer.
  */
 dmu_buf_t *
 sa_get_db(sa_handle_t *hdl)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	return (bonus);
 }
 
 /*
  * Return the opaque consumer pointer stored by sa_set_userp().
  */
 void *
 sa_get_userdata(sa_handle_t *hdl)
 {
 	void *userp = hdl->sa_userp;
 
 	return (userp);
 }
 
 /*
  * Install func as the objset's SA update callback.  The caller must hold
  * the objset's sa_lock.
  */
 void
 sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
 {
 	sa_os_t *sa = os->os_sa;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	sa->sa_update_cb = func;
 }
 
 /*
  * Locking wrapper: installs func as the objset's SA update callback
  * while holding the objset's sa_lock.
  */
 void
 sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
 {
 	kmutex_t *lock = &os->os_sa->sa_lock;
 
 	mutex_enter(lock);
 	sa_register_update_callback_locked(os, func);
 	mutex_exit(lock);
 }
 
 /*
  * Return the object number recorded in the handle's bonus buffer.
  */
 uint64_t
 sa_handle_object(sa_handle_t *hdl)
 {
 	uint64_t object = hdl->sa_bonus->db_object;
 
 	return (object);
 }
 
 /*
  * Report whether system-attribute state has been set up for this objset,
  * i.e. whether os->os_sa is present.  The other routines in this file
  * dereference os->os_sa unconditionally, so a NULL pointer means SA is
  * not available and must yield B_FALSE.
  */
 boolean_t
 sa_enabled(objset_t *os)
 {
 	return (os->os_sa != NULL);
 }
 
 /*
  * Record sa_object as the objset's SA master object.  Returns 1 and
  * changes nothing if a master object is already set, 0 otherwise.
  */
 int
 sa_set_sa_object(objset_t *os, uint64_t sa_object)
 {
 	sa_os_t *sa = os->os_sa;
 
 	/* An already-recorded master object is never replaced. */
 	if (sa->sa_master_obj != 0)
 		return (1);
 
 	sa->sa_master_obj = sa_object;
 	return (0);
 }
 
 /*
  * Return SA_HDR_SIZE() of the SA header that arg points to.
  */
 int
 sa_hdrsize(void *arg)
 {
 	sa_hdr_phys_t *phys = arg;
 	int size = SA_HDR_SIZE(phys);
 
 	return (size);
 }
 
 /*
  * Acquire the handle's sa_lock on behalf of the caller; pair with
  * sa_handle_unlock().
  */
 void
 sa_handle_lock(sa_handle_t *hdl)
 {
 	ASSERT(hdl != NULL);
 	mutex_enter(&hdl->sa_lock);
 }
 
 /*
  * Release the handle's sa_lock taken by sa_handle_lock().
  */
 void
 sa_handle_unlock(sa_handle_t *hdl)
 {
 	ASSERT(hdl != NULL);
 	mutex_exit(&hdl->sa_lock);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 275782)
@@ -1,6993 +1,6995 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 #include <sys/zvol.h>
 #include <sys/trim_map.h>
 
 #ifdef	_KERNEL
 #include <sys/callb.h>
 #include <sys/cpupart.h>
 #include <sys/zone.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /* Check hostid on import? */
 static int check_hostid = 1;
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 static int zfs_ccw_retry_interval = 300;
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
     "Check hostid on import?");
 TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
     &zfs_ccw_retry_interval, 0,
     "Configuration cache file write, retry after failure, interval (seconds)");
 
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 /*
  * One cell of the zio_taskqs table.  The ZTI_P/ZTI_BATCH/ZTI_NULL macros
  * above expand to initializers for this structure, in field order.
  */
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;	/* ZTI_MODE_* selecting how the taskq is made */
 	uint_t zti_value;	/* ZTI_P's n (thread count); 0 otherwise */
 	uint_t zti_count;	/* ZTI_P's q (taskq count); 1 BATCH, 0 NULL */
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_BATCH
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  * need to be handled with minimum delay.
  */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
 	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
 	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
 #ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
 #endif
 #ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 #endif
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
 #ifndef illumos
 extern void spa_deadman(void *arg);
 #endif
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Add a (source=src, propname=propval) list to an nvlist.
  *
  * The value is stored as a string when strval is non-NULL and as the
  * 64-bit integer intval otherwise.  The temporary per-property nvlist is
  * freed after it has been added to nvl.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	const char *name = zpool_prop_to_name(prop);
 	nvlist_t *entry;
 
 	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(entry, ZPROP_SOURCE, src) == 0);
 
 	if (strval == NULL)
 		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, intval) == 0);
 	else
 		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, strval) == 0);
 
 	VERIFY(nvlist_add_nvlist(nvl, name, entry) == 0);
 	nvlist_free(entry);
 }
 
 /*
  * Get property values from the spa configuration.
  *
  * Adds the properties derived from in-core pool state to *nvp.  The
  * caller must hold spa->spa_props_lock (asserted below).  Space, health
  * and version properties are added only when a root vdev exists; the
  * freeing/leaked properties only when a DSL pool exists.
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size, alloc, cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == FREAD), src);
 
 		/* Percentage of space allocated; guard against size == 0. */
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 			src = ZPROP_SRC_DEFAULT;
 		else
 			src = ZPROP_SRC_LOCAL;
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
 	/*
 	 * Note: when rvd != NULL, src still holds the source chosen for
 	 * ZPOOL_PROP_VERSION above and is reused for the freeing, leaked
 	 * and guid properties below.
 	 */
 	if (pool != NULL) {
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
-			    pool->dp_free_dir->dd_phys->dd_used_bytes, src);
+			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		if (pool->dp_leak_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
-			    pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
+			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	/* Maximum block size depends on the large_blocks feature. */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 	}
 
 	/*
 	 * Report cachefile from the head of the config list: "none" when
 	 * the entry has no path, the path itself when it differs from
 	 * spa_config_path, and nothing at all when it matches.
 	 */
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Allocates *nvp and fills it first with the properties derived from the
  * spa configuration and then, if a pool property object exists, with the
  * entries stored in that MOS object.  Returns 0 on success.  If the walk
  * of the property object ends with an error other than ENOENT, *nvp is
  * freed, set to NULL and the error is returned.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nvp);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0) {
 		mutex_exit(&spa->spa_props_lock);
 		return (0);
 	}
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Skip ZAP entries that are not known pool properties. */
 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 			continue;
 
 		switch (za.za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za.za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 
 				/*
 				 * bootfs is stored as a dataset object
 				 * number but reported as the dataset name.
 				 */
 				dp = spa_get_dsl(spa);
 				dsl_pool_config_enter(dp, FTAG);
 				if (err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds)) {
 					dsl_pool_config_exit(dp, FTAG);
 					/*
 					 * Leaves the switch only; the walk
 					 * continues with the next entry and
 					 * err is reassigned by the loop
 					 * condition.
 					 */
 					break;
 				}
 
 				strval = kmem_alloc(
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 				dsl_pool_config_exit(dp, FTAG);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
 			}
 
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			/* Free size must match the allocation size above. */
 			if (strval != NULL)
 				kmem_free(strval,
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za.za_name, 1, za.za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za.za_num_integers);
 				break;
 			}
 			spa_prop_add_list(*nvp, prop, strval, 0, src);
 			kmem_free(strval, za.za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 	mutex_exit(&spa->spa_props_lock);
 	/* No goto in this function targets the label below. */
 out:
 	if (err && err != ENOENT) {
 		nvlist_free(*nvp);
 		*nvp = NULL;
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Each pair in props is checked according to its property; validation
  * stops at the first error, which is returned.  On success, if a bootfs
  * string was supplied it is replaced in props by the corresponding
  * dataset object number (or the property's default when the string is
  * empty).  Returns 0 when every property is acceptable.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPROP_INVAL:
 			/* Only feature@... names are allowed here. */
 			if (!zpool_prop_feature(propname)) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/*
 			 * Sanitize the input.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			if (intval != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			fname = strchr(propname, '@') + 1;
 			if (zfeature_lookup_name(fname, NULL) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			has_feature = B_TRUE;
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * The version may not go backwards, may not exceed
 			 * the last pre-feature-flags version, and may not
 			 * be combined with a feature seen earlier in props.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 			/* Boolean properties: only 0 and 1 are accepted. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 				uint64_t propval;
 
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				if (error = dmu_objset_hold(strval, FTAG, &os))
 					break;
 
 				/*
 				 * Must be ZPL, and its property settings
 				 * must be supported by GRUB (compression
 				 * is not gzip, and large blocks are not used).
 				 */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    &propval)) == 0 &&
 				    !BOOTFS_COMPRESS_VALID(propval)) {
 					error = SET_ERROR(ENOTSUP);
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    &propval)) == 0 &&
 				    propval > SPA_OLD_MAXBLOCKSIZE) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 			    intval > ZIO_FAILURE_MODE_PANIC))
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" are accepted as-is. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is a real file name. */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			/*
 			 * Examine every byte exactly once: the for statement
 			 * alone advances the cursor.  An additional increment
 			 * in the body would skip every other character and
 			 * could step over the terminating NUL.
 			 */
 			for (check = strval; *check != '\0'; check++) {
 				/*
 				 * The kernel doesn't have an easy isprint()
 				 * check.  For this kernel check, we merely
 				 * check ASCII apart from DEL.  Fix this if
 				 * there is an easy-to-use kernel isprint().
 				 */
 				if (*check >= 0x7f) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = E2BIG;
 			break;
 
 		case ZPOOL_PROP_DEDUPDITTO:
 			if (spa_version(spa) < SPA_VERSION_DEDUP)
 				error = SET_ERROR(ENOTSUP);
 			else
 				error = nvpair_value_uint64(elem, &intval);
 			if (error == 0 &&
 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 				error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (error)
 			break;
 	}
 
 	/* Swap the bootfs name for the object number computed above. */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * If nvp carries a cachefile property, push a matching entry onto the
  * head of the pool's config list: the default spa_config_path for an
  * empty string, no path for "none", otherwise a copy of the given path.
  * When need_sync is set, an asynchronous config update is requested.
  * Does nothing if the property is absent.
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	const char *propname = zpool_prop_to_name(ZPOOL_PROP_CACHEFILE);
 	spa_config_dirent_t *dirent;
 	char *path;
 
 	if (nvlist_lookup_string(nvp, propname, &path) != 0)
 		return;
 
 	dirent = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 
 	if (path[0] == '\0') {
 		dirent->scd_path = spa_strdup(spa_config_path);
 	} else if (strcmp(path, "none") == 0) {
 		dirent->scd_path = NULL;
 	} else {
 		dirent->scd_path = spa_strdup(path);
 	}
 
 	list_insert_head(&spa->spa_config_list, dirent);
 
 	if (need_sync)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Validate and apply the pool properties in nvp.
  *
  * cachefile, altroot and readonly are skipped here.  A version property,
  * or a feature property, whose target version differs from the current
  * one is applied immediately through its own sync task.  The first
  * property of any other kind ends the scan, and the whole list is then
  * handed to a single spa_sync_props sync task.  Returns 0 or the first
  * error encountered.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	boolean_t need_sync = B_FALSE;
 	nvpair_t *elem = NULL;
 	int error;
 
 	error = spa_prop_validate(spa, nvp);
 	if (error != 0)
 		return (error);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 		uint64_t ver;
 
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		/* Any ordinary property: sync the full list and stop. */
 		if (prop != ZPOOL_PROP_VERSION && prop != ZPROP_INVAL) {
 			need_sync = B_TRUE;
 			break;
 		}
 
 		if (prop == ZPOOL_PROP_VERSION) {
 			VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 		} else {
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 			ver = SPA_VERSION_FEATURES;
 			need_sync = B_TRUE;
 		}
 
 		/* Save time if the version is already set. */
 		if (ver == spa_version(spa))
 			continue;
 
 		/*
 		 * In addition to the pool directory object, we might
 		 * create the pool properties object, the features for
 		 * read object, the features for write object, or the
 		 * feature descriptions object.
 		 */
 		error = dsl_sync_task(spa->spa_name, NULL,
 		    spa_sync_version, &ver,
 		    6, ZFS_SPACE_CHECK_RESERVED);
 		if (error)
 			return (error);
 	}
 
 	if (!need_sync)
 		return (0);
 
 	return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 	    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * If the bootfs property value is dsobj, clear it: remove the entry from
  * the pool property object and zero the in-core copy.  Nothing happens
  * when bootfs names a different dataset or no property object exists.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	VERIFY(zap_remove(spa->spa_meta_objset,
 	    spa->spa_pool_props_object,
 	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Check half of the guid-change sync task: fails with ENXIO unless the
  * root vdev is in the HEALTHY state, and asserts that the proposed guid
  * differs from the current one.
  */
 /*ARGSUSED*/
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *newguid = arg;
 	uint64_t root_state;
 
 	/* Sample the root vdev state under the state config lock. */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root_state = rvd->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (root_state != VDEV_STATE_HEALTHY)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3U(spa_guid(spa), !=, *newguid);
 
 	return (0);
 }
 
 /*
  * Sync half of the guid-change sync task: installs the new guid on the
  * root vdev, adjusts the guid sum by the difference, dirties the vdev
  * config, and records the change in the pool history.
  */
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *newguid = arg;
 	uint64_t oldguid;
 
 	/* Capture the current guid before it is overwritten below. */
 	oldguid = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	rvd->vdev_guid = *newguid;
 	rvd->vdev_guid_sum += (*newguid - oldguid);
 	vdev_config_dirty(rvd);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    oldguid, *newguid);
 }
 
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
  * the root vdev's guid, our own pool guid, and then mark all of our
  * vdevs dirty.  Note that we must make sure that all our vdevs are
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
  *
  * Returns the result of the sync task; the config sync and the event
  * notification happen only when it succeeds.
  */
 int
 spa_change_guid(spa_t *spa)
 {
 	uint64_t newguid;
 	int err;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	newguid = spa_generate_guid(NULL);
 	err = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &newguid, 5, ZFS_SPACE_CHECK_RESERVED);
 
 	if (err == 0) {
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
 	/* Drop the locks in the reverse order of acquisition. */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (err);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * Comparison callback for the error-list AVL trees.  Entries are ordered
  * by the raw bytes of their bookmark (se_bookmark).
  *
  * Returns -1, 0 or 1 when 'a' sorts before, equal to, or after 'b'.
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *sa = a;
 	const spa_error_entry_t *sb = b;
 	int ret;
 
 	/*
 	 * Use memcmp() rather than bcmp(): bcmp() is only specified to
 	 * return zero or non-zero, so the sign of its result cannot be
 	 * relied upon to produce a consistent total order for the tree.
 	 */
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	if (ret < 0)
 		return (-1);
 	else if (ret > 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Hand the caller copies of the pool's current "last" and "scrub" error
  * trees and reset the pool's own trees to empty ones.  The caller must
  * hold spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	avl_tree_t *cur_last = &spa->spa_errlist_last;
 	avl_tree_t *cur_scrub = &spa->spa_errlist_scrub;
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Snapshot both trees by structure copy ... */
 	bcopy(cur_last, last, sizeof (avl_tree_t));
 	bcopy(cur_scrub, scrub, sizeof (avl_tree_t));
 
 	/* ... then start the pool's trees over from scratch. */
 	avl_create(cur_scrub, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
 	avl_create(cur_last, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Create the taskqs for one (zio type, taskq type) slot of the pool, as
  * described by the matching zio_taskqs[t][q] table entry, and record them
  * in spa->spa_zio_taskq[t][q].  A ZTI_MODE_NULL entry leaves the slot
  * empty (count 0, NULL array).  An unrecognized mode panics.
  */
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	char name[32];
 	uint_t flags = 0;
 	boolean_t batch = B_FALSE;
 
 	/* No taskqs wanted for this slot. */
 	if (mode == ZTI_MODE_NULL) {
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 	}
 
 	ASSERT3U(count, >, 0);
 
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	/*
 	 * Work out the 'value' argument handed to the taskq constructor.
 	 * NOTE(review): presumably a thread count for FIXED and a CPU
 	 * percentage for BATCH (TASKQ_THREADS_CPU_PCT) -- confirm against
 	 * the taskq implementation.
 	 */
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		ASSERT3U(value, >=, 1);
 		/* Clamp for non-debug builds where the ASSERT is absent. */
 		value = MAX(value, 1);
 		break;
 
 	case ZTI_MODE_BATCH:
 		batch = B_TRUE;
 		flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 		break;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 
 		/* Only number the taskq name when there are several. */
 		if (count > 1) {
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		} else {
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 		}
 
 		/*
 		 * With SYSDC compiled in, pools that have their own process
 		 * (spa_proc != &p0) and zio_taskq_sysdc set get SDC taskqs;
 		 * everything else falls through to taskq_create_proc().
 		 */
 #ifdef SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			if (batch)
 				flags |= TASKQ_DC_BATCH;
 
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly lower priority
 			 * than the other taskqs.
 			 */
 			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 				pri--;
 
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 /*
  * Destroy the taskqs of one (zio type, taskq type) slot and release the
  * array that held them.  A slot that was never populated (NULL array) is
  * left untouched.
  */
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 
 	if (tqs->stqs_taskq == NULL) {
 		ASSERT0(tqs->stqs_count);
 		return;
 	}
 
 	for (uint_t i = 0; i < tqs->stqs_count; i++) {
 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[i]);
 	}
 
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 	/*
 	 * Keep the slot self-consistent: the empty state checked above (and
 	 * set up by spa_taskqs_init() for ZTI_MODE_NULL) is a NULL array
 	 * together with a zero count.
 	 */
 	tqs->stqs_count = 0;
 }
 
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and
  * priority.  A type may have several discrete taskqs to avoid lock
  * contention on the taskq itself; in that case one of them is picked
  * pseudo-randomly from a clock reading (cpu_ticks() in the kernel,
  * gethrtime() otherwise) taken modulo the number of taskqs.
  */
 void
 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	taskq_t *target;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	/* Single taskq: no selection needed. */
 	if (tqs->stqs_count == 1) {
 		taskq_dispatch_ent(tqs->stqs_taskq[0], func, arg, flags, ent);
 		return;
 	}
 
 #ifdef _KERNEL
 	target = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
 #else
 	target = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
 #endif
 
 	taskq_dispatch_ent(target, func, arg, flags, ent);
 }
 
 /*
  * Populate every (zio type, taskq type) slot of the pool by calling
  * spa_taskqs_init() for each combination.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	int type, queue;
 
 	for (type = 0; type < ZIO_TYPES; type++)
 		for (queue = 0; queue < ZIO_TASKQ_TYPES; queue++)
 			spa_taskqs_init(spa, type, queue);
 }
 
 #ifdef _KERNEL
 #ifdef SPA_PROCESS
 /*
  * Body of the per-pool process created by spa_activate().  'arg' is the
  * spa_t.  The thread names its process "zpool-<poolname>", optionally
  * binds to a processor set and enters SDC, creates the pool's zio taskqs
  * under this process, and then announces itself by moving spa_proc_state
  * from CREATED to ACTIVE.  It sleeps until the state leaves ACTIVE
  * (spa_deactivate() sets DEACTIVATE), marks itself GONE, and exits.
  */
 static void
 spa_thread(void *arg)
 {
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	/* Set both the process arguments string and the command name. */
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 #ifdef PSRSET_BIND
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		/* A failed bind is only warned about, not treated as fatal. */
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		/* Locks are dropped in the reverse order of acquisition. */
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 #endif
 
 #ifdef SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	/*
 	 * Record the owning process and thread id before creating the
 	 * taskqs; spa_taskqs_init() passes spa_proc to the taskq
 	 * constructors.
 	 */
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	/* Wake spa_activate(), which waits for the state to leave CREATED. */
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	/* Park here for the life of the pool. */
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	/* Tell spa_deactivate() we are done and hand the pool back to p0. */
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif	/* SPA_PROCESS */
 #endif
 
 /*
  * Activate an uninitialized pool.
  *
  * Moves the pool to POOL_STATE_ACTIVE with the given open 'mode', creates
  * the normal and log metaslab classes, sets up the zio taskqs (either via
  * a dedicated pool process when SPA_PROCESS is compiled in, or directly),
  * starts the TRIM thread, and initializes the dirty-vdev lists, the vdev
  * txg list and the two error-log AVL trees.
  */
 static void
 spa_activate(spa_t *spa, int mode)
 {
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef SPA_PROCESS
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			/*
 			 * Wait for spa_thread() to move the state on from
 			 * CREATED; by then it has set spa_proc and spa_did.
 			 */
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif	/* SPA_PROCESS */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	/*
 	 * NOTE(review): this unconditional ASSERT only holds when no pool
 	 * process was created above, i.e. it appears to assume SPA_PROCESS
 	 * is not defined in this build -- confirm.
 	 */
 	ASSERT(spa->spa_proc == &p0);
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	/*
 	 * Start TRIM thread.
 	 */
 	trim_thread_create(spa);
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	/* Both error trees are keyed by bookmark, see the comparator. */
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Opposite of spa_activate().
  *
  * Tears down everything spa_activate() set up: the TRIM thread, the vdev
  * txg list, the dirty-vdev lists, the zio taskqs, the metaslab classes
  * and the error-log trees.  The pool is returned to
  * POOL_STATE_UNINITIALIZED and, if a pool process exists, it is told to
  * exit and waited for.  The pool must already be unloaded (no sync
  * thread, dsl pool, root vdev or async zio roots).
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	/*
 	 * Stop TRIM thread in case spa_unload() wasn't called directly
 	 * before spa_deactivate().
 	 */
 	trim_thread_destroy(spa);
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	/* Mirror of spa_create_zio_taskqs(): tear down every slot. */
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * Shut down the pool process, if any: request DEACTIVATE, then wait
 	 * for spa_thread() to acknowledge by setting GONE.
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 #ifdef SPA_PROCESS
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 #endif	/* SPA_PROCESS */
 }
 
 /*
  * Build a vdev tree from the config nvlist 'nv'.  The vdev for 'nv' itself
  * is allocated (and validated) by vdev_alloc() and stored in *vdp; for
  * interior vdevs each entry of ZPOOL_CONFIG_CHILDREN is parsed recursively
  * with the new vdev as parent.  All resulting vdevs are left in the CLOSED
  * state, ready for open/creation/import.
  *
  * Returns 0 on success.  On failure after the top vdev was allocated, that
  * vdev is freed and *vdp is set to NULL; a children array that cannot be
  * looked up (other than being absent) yields EINVAL.
  */
 static int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int err;
 
 	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (err != 0)
 		return (err);
 
 	/* Leaves have no children to descend into. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	err = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 
 	/* An interior vdev without a children array is accepted as-is. */
 	if (err == ENOENT)
 		return (0);
 
 	if (err != 0) {
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *cvd;
 
 		err = spa_config_parse(spa, &cvd, kids[c], *vdp, c, atype);
 		if (err != 0) {
 			vdev_free(*vdp);
 			*vdp = NULL;
 			return (err);
 		}
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 /*
  * Opposite of spa_load().
  *
  * Quiesces the pool (TRIM thread, async tasks, txg sync, outstanding async
  * zios), closes the deferred bpobj, and then, holding all config locks as
  * writer, frees the vdev tree, closes the dsl pool, unloads the DDT, and
  * releases the spare and l2cache aux vdevs and their configs as well as
  * the pool comment.  The caller must hold spa_namespace_lock.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	int i;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop TRIM thread.
 	 */
 	trim_thread_destroy(spa);
 
 	/*
 	 * Stop async tasks.
 	 */
 	spa_async_suspend(spa);
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		/* One root zio per CPU; this 'i' shadows the outer one. */
 		for (int i = 0; i < max_ncpus; i++)
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	/* vdev_free() of the root is expected to clear spa_root_vdev. */
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/* Release the spare vdevs, their array, and the stashed config. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/* Same for the l2cache vdevs, clearing their stats first. */
 	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 	}
 	if (spa->spa_l2cache.sav_vdevs) {
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	/* Reset the suspend count taken by spa_async_suspend() above. */
 	spa->spa_async_suspended = 0;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  *
  * The caller must hold all config locks (SCL_ALL) as writer.  Any spare
  * vdevs from a previous load are closed and freed first.
  */
 static void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		vd = spa->spa_spares.sav_vdevs[i];
 
 		/*
 		 * Undo the spare registration done for the matching in-pool
 		 * vdev by the loop further down (presumably what the
 		 * original "spa_activate()" wording referred to is
 		 * spa_spare_add()/spa_spare_activate()).
 		 */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL && tvd->vdev_isspare)
 			spa_spare_remove(tvd);
 		vdev_close(vd);
 		vdev_free(vd);
 	}
 
 	if (spa->spa_spares.sav_vdevs)
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 
 	/* No stashed config means no spares at all. */
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.   For each spare, there is potentially two different vdev_t
 	 * structures associated with it: one in the list of spares (used only
 	 * for basic validation purposes) and one in the active vdev
 	 * configuration (if it's spared in).  During this phase we open and
 	 * validate each vdev on the spare list.  If the vdev also exists in the
 	 * active configuration, then we also mark this vdev as an active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		/* tvd, if found, is the copy in the active vdev tree. */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pool would think the spare
 			 * is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		/* An aux vdev is its own top-level vdev. */
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that fails to open stays listed but unregistered. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/*
 	 * 'spares' is reused here for a temporary array of freshly
 	 * generated nvlists; the generated nvlists and the array are
 	 * released below once nvlist_add_nvlist_array() has been called.
 	 */
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  *
  * The caller must hold all config locks (SCL_ALL) as writer.  Old vdevs
  * whose guid no longer appears in the config are removed from the L2ARC
  * (when present there) and freed.
  */
 static void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* Without a stashed config the new device list is empty. */
 	if (sav->sav_config != NULL) {
 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
 		newvdevs = NULL;
 	}
 
 	/* Detach the old array so entries can be migrated or purged. */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 */
 				newvdevs[i] = vd;
 				/* NULL marks the old slot as carried over. */
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			/* Validation result is deliberately ignored here. */
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	for (i = 0; i < oldnvdevs; i++) {
 		uint64_t pool;
 
 		vd = oldvdevs[i];
 		/* Non-NULL entries were not matched by any new guid. */
 		if (vd != NULL) {
 			ASSERT(vd->vdev_isl2cache);
 
 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
 			vdev_clear_stats(vd);
 			vdev_free(vd);
 		}
 	}
 
 	if (oldvdevs)
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 
 	/*
 	 * With no config there is nothing to publish.  sav_count is still 0
 	 * at this point, so the cleanup at 'out' touches nothing (and never
 	 * reads the then-uninitialized 'l2cache').
 	 */
 	if (sav->sav_config == NULL)
 		goto out;
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* 'l2cache' is reused for a temporary array of generated nvlists. */
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
 	/* Release the temporary nvlists and array (no-op when count is 0). */
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * *value.  The packed size is taken from the first 64-bit word of the
  * object's bonus buffer.  *value is NULL on entry to the unpack and stays
  * NULL if anything before it fails.
  *
  * Returns 0 on success or the error from the bonus hold, read or unpack.
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *bonus;
 	char *buf = NULL;
 	size_t len = 0;
 	int err;
 
 	*value = NULL;
 
 	/* The bonus buffer tells us how many packed bytes to fetch. */
 	err = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus);
 	if (err != 0)
 		return (err);
 	len = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	buf = kmem_alloc(len, KM_SLEEP);
 	err = dmu_read(spa->spa_meta_objset, obj, 0, len, buf,
 	    DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, len, value, 0);
 	kmem_free(buf, len);
 
 	return (err);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd' (children first) and, for every leaf
  * that is dead and is not a hole, post an autoreplace request and a
  * vdev-check sysevent so the autoreplace code learns that the device
  * could not be opened.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
 	/* Only dead, non-hole leaves are of interest. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 	if (!vdev_is_dead(vd))
 		return;
 	if (vd->vdev_ishole)
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Validate the current config against the MOS config.
  *
  * 'config' is the MOS config; its vdev tree is parsed into a temporary
  * root (mrvd) and compared, top-level vdev by top-level vdev, with the
  * pool's current root vdev.  Missing log devices are reported through
  * spa_load_info (normal import) or swapped in from the MOS tree (when
  * ZFS_IMPORT_MISSING_LOG is set).
  *
  * Returns B_TRUE when, afterwards, the root vdev's guid sum matches the
  * guid sum recorded in the uberblock.
  */
 static boolean_t
 spa_config_valid(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv;
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
 	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing devices in this config.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *missing;
 		uint64_t idx = 0;
 
 		/*
 		 * Worst case every top-level vdev is a missing log.  The
 		 * element type is nvlist_t *, and the same expression is
 		 * used for the matching kmem_free() below.
 		 */
 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 		VERIFY(nvlist_alloc(&missing, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 			vdev_t *mtvd  = mrvd->vdev_child[c];
 
 			/* Missing here, but known to the MOS as a log. */
 			if (tvd->vdev_ops == &vdev_missing_ops &&
 			    mtvd->vdev_ops != &vdev_missing_ops &&
 			    mtvd->vdev_islog)
 				child[idx++] = vdev_config_generate(spa, mtvd,
 				    B_FALSE, 0);
 		}
 
 		if (idx) {
 			VERIFY(nvlist_add_nvlist_array(missing,
 			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, missing) == 0);
 
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(missing);
 		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t *));
 	}
 
 	/*
 	 * Compare the root vdev tree with the information we have
 	 * from the MOS config (mrvd). Check each top-level vdev
 	 * with the corresponding MOS config top-level (mtvd).
 	 */
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		vdev_t *mtvd  = mrvd->vdev_child[c];
 
 		/*
 		 * Resolve any "missing" vdevs in the current configuration.
 		 * If we find that the MOS config has more accurate information
 		 * about the top-level vdev then use that vdev instead.
 		 */
 		if (tvd->vdev_ops == &vdev_missing_ops &&
 		    mtvd->vdev_ops != &vdev_missing_ops) {
 
 			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
 				continue;
 
 			/*
 			 * Device specific actions.
 			 */
 			if (mtvd->vdev_islog) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 			} else {
 				/*
 				 * XXX - once we have 'readonly' pool
 				 * support we should be able to handle
 				 * missing data devices by transitioning
 				 * the pool to readonly.
 				 */
 				continue;
 			}
 
 			/*
 			 * Swap the missing vdev with the data we were
 			 * able to obtain from the MOS config.
 			 */
 			vdev_remove_child(rvd, tvd);
 			vdev_remove_child(mrvd, mtvd);
 
 			vdev_add_child(rvd, mtvd);
 			vdev_add_child(mrvd, tvd);
 
 			/* vdev_load() is called with the config lock dropped. */
 			spa_config_exit(spa, SCL_ALL, FTAG);
 			vdev_load(mtvd);
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 			vdev_reopen(rvd);
 		} else if (mtvd->vdev_islog) {
 			/*
 			 * Load the slog device's state from the MOS config
 			 * since it's possible that the label does not
 			 * contain the most up-to-date information.
 			 */
 			vdev_load_log_state(tvd, mtvd);
 			vdev_reopen(tvd);
 		}
 	}
 	/* The temporary MOS tree (now holding any swapped-out vdevs) goes. */
 	vdev_free(mrvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Ensure we were able to validate the config.
 	 */
 	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
 }
 
 /*
  * Check for missing log devices.  When the log state is MISSING or
  * UNKNOWN, every dataset's ZIL chain is checked; if that check reports a
  * problem the log state is set to MISSING and B_TRUE is returned.  In
  * any other log state nothing is checked and B_FALSE is returned.
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	boolean_t rv;
 
 	/*
 	 * MISSING is rechecked on purpose in case the slog has been
 	 * restored since the last look.
 	 */
 	if (spa->spa_log_state != SPA_LOG_MISSING &&
 	    spa->spa_log_state != SPA_LOG_UNKNOWN)
 		return (B_FALSE);
 
 	rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
 	    NULL, DS_FIND_CHILDREN) != 0);
 	if (rv)
 		spa_set_log_state(spa, SPA_LOG_MISSING);
 
 	return (rv);
 }
 
 /*
  * Passivate the metaslab group of every top-level log vdev.  The caller
  * must hold SCL_ALLOC as writer.
  *
  * Returns B_TRUE if at least one log vdev was found, B_FALSE otherwise
  * (including when the pool reports having no slogs at all).
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t found = B_FALSE;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	if (!spa_has_slogs(spa))
 		return (B_FALSE);
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_passivate(top->vdev_mg);
 		found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Activate the metaslab group of every top-level log vdev.  The caller
  * must hold SCL_ALLOC as writer.
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_activate(top->vdev_mg);
 	}
 }
 
 /*
  * Run zil_vdev_offline over every dataset in the pool.  Returns 0 on
  * success or the error reported by the dataset walk.
  */
 int
 spa_offline_log(spa_t *spa)
 {
 	int err;
 
 	err = dmu_objset_find(spa_name(spa), zil_vdev_offline,
 	    NULL, DS_FIND_CHILDREN);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (err);
 }
 
 /*
  * Run spa_check_removed() on every vdev of the given aux vdev list.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	for (int idx = 0; idx < sav->sav_count; idx++) {
 		spa_check_removed(sav->sav_vdevs[idx]);
 	}
 }
 
 /*
  * Completion hook for a claim zio.  On success, raise the pool's
  * spa_claim_max_txg to the birth txg of the claimed block if that is
  * larger than the current value.  Failed zios are ignored.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error == 0) {
 		mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 		if (zio->io_bp->blk_birth > spa->spa_claim_max_txg)
 			spa->spa_claim_max_txg = zio->io_bp->blk_birth;
 		mutex_exit(&spa->spa_props_lock);
 	}
 }
 
 /*
  * Error tallies gathered while verifying a pool during load.  Filled in by
  * spa_load_verify_done() and consumed by spa_load_verify().
  */
 typedef struct spa_load_error {
 	/*
 	 * Failed reads of indirect (level != 0) or metadata-type blocks,
 	 * excluding intent-log blocks.
 	 */
 	uint64_t	sle_meta_count;
 	/* All other failed reads. */
 	uint64_t	sle_data_count;
 } spa_load_error_t;
 
 /*
  * Completion callback for the verification reads issued by
  * spa_load_verify_cb().  A failed read is counted in the
  * spa_load_error_t hanging off io_private, as a metadata error for
  * indirect or metadata-type blocks other than intent-log blocks and as a
  * data error otherwise.  The read buffer is freed and the in-flight
  * counter is dropped, waking any issuer throttled on it.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	spa_load_error_t *tally = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 
 	if (zio->io_error != 0) {
 		boolean_t is_meta =
 		    (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG;
 
 		if (is_meta)
 			atomic_inc_64(&tally->sle_meta_count);
 		else
 			atomic_inc_64(&tally->sle_data_count);
 	}
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Maximum number of concurrent scrub i/os to create while verifying
  * a pool while importing it.
  */
 int spa_load_verify_maxinflight = 10000;
 /*
  * When clear, spa_load_verify() skips the pool traversal entirely and
  * spa_load_verify_cb() issues no reads.
  */
 boolean_t spa_load_verify_metadata = B_TRUE;
 /*
  * When clear, spa_load_verify_cb() skips blocks whose buffer type is
  * ARC_BUFC_DATA.
  */
 boolean_t spa_load_verify_data = B_TRUE;
 
 /* Export the three tunables above under the vfs.zfs sysctl node. */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
     &spa_load_verify_maxinflight, 0,
     "Maximum number of concurrent scrub I/Os to create while verifying a "
     "pool while importing it");
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
     &spa_load_verify_metadata, 0,
     "Check metadata on import?");
  
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
     &spa_load_verify_data, 0,
     "Check user data on import?");
  
 /*
  * Traversal callback used by spa_load_verify().  'arg' is the root zio.
  * For every block that is neither a hole nor embedded, and that is not
  * filtered out by the spa_load_verify_* tunables, an asynchronous raw
  * scrub read is issued as a child of the root zio; its completion is
  * handled by spa_load_verify_done().  The number of reads in flight is
  * capped at spa_load_verify_maxinflight.  Always returns 0.
  */
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zio_t *rio;
 	size_t size;
 	void *data;
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 
 	/* User data is only read when asked for. */
 	if (!spa_load_verify_data && BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA)
 		return (0);
 
 	rio = arg;
 	size = BP_GET_PSIZE(bp);
 	data = zio_data_buf_alloc(size);
 
 	/* Throttle: wait until there is room for one more read. */
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_scrub_inflight++;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(rio, spa, bp, data, size,
 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 
 	return (0);
 }
 
 /*
  * Verify the pool by traversing it from spa_verify_min_txg and reading
  * the visited blocks, unless the rewind policy says never to rewind (in
  * which case 0 is returned immediately).  The metadata and data error
  * counts are stored in the spa.
  *
  * If the traversal succeeded and both counts are within the limits of the
  * rewind policy, the load txg/timestamp are recorded, load statistics are
  * added to spa_load_info, and 0 is returned.  Otherwise spa_load_max_txg
  * is set to the current uberblock txg and an error is returned: the
  * traversal error if it is ENXIO or EIO, else EIO.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	zio_t *rio;
 	int64_t loss = 0;
 	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
 	/* All verification reads hang off this root and tally into sle. */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 		    spa_load_verify_cb, rio);
 	}
 
 	(void) zio_wait(rio);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	/* Traversal itself failed. */
 	if (error != 0) {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	/* Traversal worked but too many blocks were unreadable. */
 	if (sle.sle_meta_count > policy.zrp_maxmeta ||
 	    sle.sle_data_count > policy.zrp_maxdata) {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 		return (EIO);
 	}
 
 	spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
 	VERIFY(nvlist_add_int64(spa->spa_load_info,
 	    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
 
 	return (0);
 }
 
 /*
  * Look up the uint64_t value of pool property 'prop' in the pool props
  * object of the MOS and store it in '*val'.  The result of the lookup is
  * deliberately ignored, so '*val' is left as the lookup left it when the
  * property is not present.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	const char *propname = zpool_prop_to_name(prop);
 
 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    propname, sizeof (uint64_t), 1, val);
 }
 
 /*
  * Look up the uint64_t entry 'name' in the pool directory object of the
  * MOS and store it in '*val'.  Returns the zap_lookup() result unchanged
  * (0 on success).
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
 {
 	int err;
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	return (err);
 }
 
 /*
  * Convenience helper for load failure paths: put 'vdev' into the
  * VDEV_STATE_CANT_OPEN state with the auxiliary reason 'aux', then hand
  * 'err' straight back so the caller can write
  * "return (spa_vdev_err(...))".
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 
 	return (err);
 }
 
 /*
  * Repair the config after an interrupted pool split.
  *
  * The ZPOOL_CONFIG_SPLIT nvlist is present in the config of both the
  * splitting pool and the split-off pool, but only the splitting pool
  * carries the list of guids of the vdevs being split off.  If either
  * the nvlist or that guid list is missing there is nothing to do.
  *
  * Given the list, one of two things happens:
  *
  *   - Rejoin: every listed vdev that can be found is marked online again
  *     and the root vdev is reopened.  Unless every one of them reports
  *     VDEV_AUX_SPLIT_POOL, the config is left alone with all the vdevs
  *     still in place in the original pool.
  *
  *   - Complete the split: if some listed vdev could not be found at all,
  *     or every found vdev reports VDEV_AUX_SPLIT_POOL after the reopen,
  *     vdev_split() is called on each found vdev and the root vdev is
  *     reopened.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *split_nvl;
 	uint64_t *guids;
 	uint_t nguids, idx;
 	uint_t nsplit = 0;
 	vdev_t **vdevs;
 	boolean_t try_rejoin = B_TRUE;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &split_nvl) != 0)
 		return;
 
 	/* Without the guid list the config is incomplete; give up. */
 	if (nvlist_lookup_uint64_array(split_nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    &guids, &nguids) != 0)
 		return;
 
 	vdevs = kmem_zalloc(nguids * sizeof (vdev_t *), KM_SLEEP);
 
 	/* Resolve each guid to a vdev and try to bring it back online. */
 	for (idx = 0; idx < nguids; idx++) {
 		/* A zero guid denotes a hole vdev. */
 		if (guids[idx] == 0)
 			continue;
 
 		vdevs[idx] = spa_lookup_by_guid(spa, guids[idx], B_FALSE);
 		if (vdevs[idx] != NULL) {
 			vdevs[idx]->vdev_offline = B_FALSE;
 		} else {
 			/*
 			 * A listed vdev is gone; reopening the disks is
 			 * pointless, so just finish the split.
 			 */
 			try_rejoin = B_FALSE;
 		}
 	}
 
 	if (try_rejoin) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/*
 		 * Count leading entries that are either holes or were
 		 * successfully split off; stop at the first one that
 		 * was not.
 		 */
 		for (idx = 0; idx < nguids; idx++) {
 			if (vdevs[idx] != NULL &&
 			    vdevs[idx]->vdev_stat.vs_aux !=
 			    VDEV_AUX_SPLIT_POOL)
 				break;
 			nsplit++;
 		}
 	}
 
 	/*
 	 * Split the vdevs off for good when the rejoin was never attempted
 	 * or when every single disk has already moved to the new pool.
 	 */
 	if (!try_rejoin || nsplit == nguids) {
 		for (idx = 0; idx < nguids; idx++) {
 			if (vdevs[idx] != NULL)
 				vdev_split(vdevs[idx]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vdevs, nguids * sizeof (vdev_t *));
 }
 
 /*
  * Load a pool from spa->spa_config.
  *
  * This wrapper extracts the pool guid, comment, version and config txg
  * from the config, refuses an import (SPA_LOAD_IMPORT/SPA_LOAD_TRYIMPORT)
  * of a pool guid that already exists with EEXIST, and otherwise hands the
  * real work to spa_load_impl().  Afterwards it records the reference
  * count baseline, posts an ereport for any failure other than EBADF, and
  * sets the final load state.
  *
  * Returns 0 on success or an errno value on failure.
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
     boolean_t mosconfig)
 {
 	nvlist_t *cfg = spa->spa_config;
 	nvlist_t *split_nvl;
 	char *ereport_class = FM_EREPORT_ZFS_POOL;
 	char *pool_comment;
 	uint64_t guid;
 	int err;
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_COMMENT,
 	    &pool_comment) == 0)
 		spa->spa_comment = spa_strdup(pool_comment);
 
 	/*
 	 * Labels written before versioning was introduced carry no
 	 * version; treat those as the initial version.
 	 */
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	(void) nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	if ((state != SPA_LOAD_IMPORT && state != SPA_LOAD_TRYIMPORT) ||
 	    !spa_guid_exists(guid, 0)) {
 		spa->spa_config_guid = guid;
 
 		/* Keep a private copy of any in-progress split description. */
 		if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_SPLIT,
 		    &split_nvl) == 0) {
 			VERIFY(nvlist_dup(split_nvl,
 			    &spa->spa_config_splitting, KM_SLEEP) == 0);
 		}
 
 		/* Start every load attempt with a fresh load-info nvlist. */
 		nvlist_free(spa->spa_load_info);
 		spa->spa_load_info = fnvlist_alloc();
 
 		gethrestime(&spa->spa_loaded_ts);
 		err = spa_load_impl(spa, guid, cfg, state, type, mosconfig,
 		    &ereport_class);
 	} else {
 		/* Importing a pool whose guid is already known. */
 		err = SET_ERROR(EEXIST);
 	}
 
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	if (err != 0) {
 		if (err != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		if (err != EBADF)
 			zfs_ereport_post(ereport_class, spa, NULL, NULL, 0, 0);
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	} else {
 		spa->spa_load_state = SPA_LOAD_NONE;
 	}
 	spa->spa_ena = 0;
 
 	return (err);
 }
 
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  *
  * Parameters:
  *   spa        - pool being loaded; spa_namespace_lock must be held.
  *   pool_guid  - pool guid the caller read from 'config'.
  *   config     - configuration nvlist to parse the vdev tree from.
  *   state      - kind of load being performed (open, import, tryimport,
  *                recover); several steps below are skipped or altered
  *                depending on it.
  *   type       - SPA_IMPORT_EXISTING or SPA_IMPORT_ASSEMBLE; the latter
  *                skips label validation, spares/l2cache loading and the
  *                final config/log validation.
  *   mosconfig  - B_TRUE if 'config' is trusted.  If B_FALSE the pool is
  *                accessed read-only, and once the MOS is readable the
  *                config stored there replaces spa_config and the load is
  *                restarted through spa_load() with mosconfig == B_TRUE.
  *   ereport    - in/out: ereport class for the caller to post on failure;
  *                changed here to FM_EREPORT_ZFS_LOG_REPLAY if the log
  *                check fails.
  *
  * Returns 0 on success or an errno value on failure.  Most failure paths
  * also mark the root vdev VDEV_STATE_CANT_OPEN via spa_vdev_err().
  */
 static int
 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport)
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
 	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
 	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
 	uint64_t obj;
 	boolean_t missing_feat_write = B_FALSE;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
 	 * This prevents things like resilvering recently removed devices.
 	 */
 	if (!mosconfig)
 		spa->spa_mode = FREAD;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa->spa_load_state = state;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
 		return (SET_ERROR(EINVAL));
 
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 * (one root zio per CPU, up to max_ncpus).
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	ASSERT(spa->spa_root_vdev == rvd);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	/*
 	 * Try to open all vdevs, loading each label in the process.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We need to validate the vdev labels against the configuration that
 	 * we have in hand, which is dependent on the setting of mosconfig. If
 	 * mosconfig is true then we're validating the vdev labels based on
 	 * that config.  Otherwise, we're validating against the cached config
 	 * (zpool.cache) that was read when we loaded the zfs module, and then
 	 * later we will recursively call spa_load() and validate against
 	 * the vdev config.
 	 *
 	 * If we're assembling a new pool that's been split off from an
 	 * existing pool, the labels haven't yet been updated so we skip
 	 * validation for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		error = vdev_validate(rvd, mosconfig);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (error != 0)
 			return (error);
 
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL || nvlist_lookup_nvlist(label,
 		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
 			nvlist_free(label);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
 		    0);
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				VERIFY(nvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "") == 0);
 			}
 		}
 
 		if (!nvlist_empty(unsup_feat)) {
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
 			nvlist_free(unsup_feat);
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.  We first check to see if the pool
 	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
 	 * If it is, defer the vdev_guid_sum check till later so we
 	 * can handle missing vdevs.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	/*
 	 * Finish or undo a partly-completed split, then drop the saved
 	 * split description.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 
 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				missing_feat_write = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (missing_feat_write &&
 		    spa_writeable(spa))) {
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;
 
 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Untrusted config: now that the MOS can be read, fetch the config
 	 * stored there, check the hostid recorded in it, and restart the
 	 * load using that config as the trusted one.
 	 */
 	if (!mosconfig) {
 		uint64_t hostid;
 		nvlist_t *policy = NULL, *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 			char *hostname;
 			unsigned long myhostid = 0;
 
 			VERIFY(nvlist_lookup_string(nvconfig,
 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
 
 #ifdef	_KERNEL
 			myhostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
 			/*
 			 * We're emulating the system's hostid in userland, so
 			 * we can't use zone_get_hostid().
 			 */
 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
 #endif	/* _KERNEL */
 			if (check_hostid && hostid != 0 && myhostid != 0 &&
 			    hostid != myhostid) {
 				nvlist_free(nvconfig);
 				cmn_err(CE_WARN, "pool '%s' could not be "
 				    "loaded as it was last accessed by "
 				    "another system (host: %s hostid: 0x%lx). "
 				    "See: http://illumos.org/msg/ZFS-8000-EY",
 				    spa_name(spa), hostname,
 				    (unsigned long)hostid);
 				return (SET_ERROR(EBADF));
 			}
 		}
 		/* Carry the caller's rewind policy over into the MOS config. */
 		if (nvlist_lookup_nvlist(spa->spa_config,
 		    ZPOOL_REWIND_POLICY, &policy) == 0)
 			VERIFY(nvlist_add_nvlist(nvconfig,
 			    ZPOOL_REWIND_POLICY, policy) == 0);
 
 		spa_config_set(spa, nvconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);
 
 		/* Restart the load, this time with mosconfig == B_TRUE. */
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
 	 */
 
 	/*
 	 * Load any hot spares for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 		if (load_nvlist(spa, spa->spa_spares.sav_object,
 		    &spa->spa_spares.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Load any level 2 ARC devices for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 		    &spa->spa_l2cache.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		uint64_t autoreplace;
 
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
 		    &spa->spa_dedup_ditto);
 
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If the 'autoreplace' property is set, then post a resource notifying
 	 * the ZFS DE that it should not issue any faults for unopenable
 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
 	 * unopenable vdevs so that the normal autoreplace handler can take
 	 * over.
 	 */
 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 		/*
 		 * For the import case, this is done in spa_import(), because
 		 * at this point we're using the spare definitions from
 		 * the MOS config, not necessarily from the userland config.
 		 */
 		if (state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Load the DDTs (dedup tables).
 	 */
 	error = ddt_load(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	spa_update_dspace(spa);
 
 	/*
 	 * Validate the config, using the MOS config to fill in any
 	 * information which might be missing.  If we fail to validate
 	 * the config then declare the pool unfit for use. If we're
 	 * assembling a pool from a split, the log is not transferred
 	 * over.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		nvlist_t *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_config_valid(spa, nvconfig)) {
 			nvlist_free(nvconfig);
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 			    ENXIO));
 		}
 		nvlist_free(nvconfig);
 
 		/*
 		 * Now that we've validated the config, check the state of the
 		 * root vdev.  If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 
 		if (spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
 	}
 
 	if (missing_feat_write) {
 		ASSERT(state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
 	}
 
 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
 	 */
 	if (state != SPA_LOAD_TRYIMPORT) {
 		/* Note: intentional assignment; error is tested for non-zero. */
 		if (error = spa_load_verify(spa))
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    error));
 	}
 
 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 		 * invoked from zil_claim_log_block()'s i/o done callback.
 		 * Price of rollback is that we abandon the log.
 		 */
 		spa->spa_claiming = B_TRUE;
 
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		(void) dmu_objset_find(spa_name(spa),
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
 		spa->spa_claiming = B_FALSE;
 
 		spa_set_log_state(spa, SPA_LOG_GOOD);
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by either zil_check_log_chain()
 		 * (invoked from spa_check_logs()) or zil_claim() above.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
 		 *
 		 * If this is a verbatim import, trust the current
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
 		    state == SPA_LOAD_IMPORT ||
 		    state == SPA_LOAD_RECOVER ||
 		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 			need_update = B_TRUE;
 
 		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;
 
 		/*
 		 * Update the config cache asynchronously in case we're the
 		 * root pool, in which case the config cache isn't writable yet.
 		 */
 		if (need_update)
 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open");
 
 		/*
 		 * Delete any inconsistent datasets.
 		 */
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 	}
 
 	return (0);
 }
 
 /*
  * Tear the pool down and attempt the load again, limited to uberblocks
  * older than the one used by the attempt that just failed: the maximum
  * load txg is set to one less than the current uberblock's txg.  The
  * pool is re-activated in the mode it had before the teardown.
  *
  * Returns the result of the new spa_load() attempt.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	/* Capture the mode before spa_unload()/spa_deactivate() run. */
 	const int saved_mode = spa->spa_mode;
 	int err;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	/* Step back past the uberblock that failed. */
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);
 
 	err = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig);
 
 	return (err);
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * Parameters:
  *   spa          - pool to load.
  *   state        - load state passed through to spa_load().
  *   mosconfig    - passed through to spa_load().
  *   max_request  - highest txg the first load attempt may use; any value
  *                  other than UINT64_MAX also enables extreme rewind for
  *                  that attempt.
  *   rewind_flags - rewind policy request bits (ZPOOL_NEVER_REWIND,
  *                  ZPOOL_EXTREME_REWIND).
  *
  * Returns 0 if the first load succeeds.  Otherwise returns the rewind
  * result when 'state' is SPA_LOAD_RECOVER, or the error of the first
  * load attempt in every other case.
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
 	    mosconfig);
 	if (load_error == 0)
 		return (0);
 
 	/*
 	 * Snapshot the config as it stands after the failed attempt so it
 	 * can be restored once the rewind attempts below are finished.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state, mosconfig);
 	}
 
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	/*
 	 * Restore the saved config unless we successfully rewound in
 	 * SPA_LOAD_RECOVER mode.  In that one case the snapshot is no
 	 * longer wanted and must be freed here, otherwise it is leaked.
 	 * NOTE(review): this relies on spa_config_set() taking ownership
 	 * of the nvlist it is given -- confirm against its definition.
 	 */
 	if (config != NULL) {
 		if (rewind_error || state != SPA_LOAD_RECOVER)
 			spa_config_set(spa, config);
 		else
 			nvlist_free(config);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
  * ambiguous state.
  *
  * Parameters:
  *   pool     - name of the pool to open.
  *   spapp    - out: set to the opened spa_t on success, NULL on failure.
  *   tag      - reference holder tag passed to spa_open_ref().
  *   nvpolicy - optional rewind policy nvlist; when NULL the policy is
  *              taken from the pool's own spa_config.
  *   config   - optional out parameter; when non-NULL it receives a
  *              generated config on success, or a copy of spa_config plus
  *              the load info when the load fails.
  *
  * Returns 0 on success, ENOENT if the pool is not found (or its vdevs
  * report it exported/destroyed), or the load error otherwise.
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
     nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (mutex_owner(&spa_namespace_lock) != curthread) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_rewind_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zrp_request & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
 		    policy.zrp_request);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				VERIFY(nvlist_dup(spa->spa_config, config,
 				    KM_SLEEP) == 0);
 				VERIFY(nvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info) == 0);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.  The caller may not have asked
 	 * for a config at all (spa_open() passes NULL) while the rewind
 	 * policy came from the pool's own spa_config, so 'config' must be
 	 * checked before it is dereferenced.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info) == 0);
 	}
 
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 		if (firstopen)
 			zvol_create_minors(spa->spa_name);
 #endif
 #endif
 	}
 
 	*spapp = spa;
 
 	return (0);
 }
 
 int
 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
     nvlist_t **config)
 {
 	int error;
 
 	/*
 	 * Open the pool with a caller-supplied rewind policy, optionally
 	 * handing the resulting config back through 'config'.
 	 */
 	error = spa_open_common(name, spapp, tag, policy, config);
 
 	return (error);
 }
 
 int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
 	int error;
 
 	/* Plain open: no rewind policy and no config returned. */
 	error = spa_open_common(name, spapp, tag, NULL, NULL);
 
 	return (error);
 }
 
 /*
  * Find the pool called 'name' and bump its inject reference count, which
  * prevents it from being exported or destroyed.  Returns the spa_t, or
  * NULL when no pool with that name is in the namespace.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *found;
 
 	mutex_enter(&spa_namespace_lock);
 	found = spa_lookup(name);
 	if (found != NULL)
 		found->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (found);
 }
 
 void
 spa_inject_delref(spa_t *spa)
 {
 	/* Drop one inject reference while holding the namespace lock. */
 	mutex_enter(&spa_namespace_lock);
 	--spa->spa_inject_ref;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Copy the pool's hot-spare list into the vdev tree of 'config' and mark
  * any spare that is currently in use by a pool as spared.
  * The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *vdev_tree;
 	nvlist_t **sparelist;
 	uint_t nspares;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	/* Nothing to report when the pool has no spares. */
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &nspares) == 0);
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then fetch it back from there so
 	 * that the status updates below land on the entries held by
 	 * 'config'.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, sparelist, nspares) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &nspares) == 0);
 
 	/*
 	 * Walk the spares looking for ones that have since been pressed
 	 * into service as an active spare, and flag them accordingly.
 	 */
 	for (uint_t idx = 0; idx < nspares; idx++) {
 		uint64_t guid;
 		uint64_t pool;
 		vdev_stat_t *vs;
 		uint_t vsc;
 
 		VERIFY(nvlist_lookup_uint64(sparelist[idx],
 		    ZPOOL_CONFIG_GUID, &guid) == 0);
 		if (!spa_spare_exists(guid, &pool, NULL) || pool == 0ULL)
 			continue;
 
 		VERIFY(nvlist_lookup_uint64_array(sparelist[idx],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0);
 		vs->vs_state = VDEV_STATE_CANT_OPEN;
 		vs->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Copy the pool's l2cache device list into the vdev tree of 'config' and
  * refresh each entry's vdev stats from the matching in-core vdev.
  * The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *vdev_tree;
 	nvlist_t **cachedevs;
 	uint_t ncache;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	/* Nothing to report when the pool has no cache devices. */
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &cachedevs, &ncache) == 0);
 	if (ncache == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then fetch it back from there so
 	 * that the stats written below land on the entries held by 'config'.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, cachedevs, ncache) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, &cachedevs, &ncache) == 0);
 
 	/* Update level 2 cache device stats. */
 	for (uint_t n = 0; n < ncache; n++) {
 		vdev_t *match = NULL;
 		vdev_stat_t *vs;
 		uint64_t guid;
 		uint_t vsc;
 
 		VERIFY(nvlist_lookup_uint64(cachedevs[n],
 		    ZPOOL_CONFIG_GUID, &guid) == 0);
 
 		/* Find the in-core vdev carrying the same guid. */
 		for (uint_t k = 0; k < spa->spa_l2cache.sav_count; k++) {
 			vdev_t *cand = spa->spa_l2cache.sav_vdevs[k];
 
 			if (cand->vdev_guid == guid) {
 				match = cand;
 				break;
 			}
 		}
 		ASSERT(match != NULL);
 
 		VERIFY(nvlist_lookup_uint64_array(cachedevs[n],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0);
 		vdev_get_stats(match, vs);
 	}
 }
 
 /*
  * Build an nvlist of feature name -> uint64 value from the pool's
  * features-for-read and features-for-write ZAP objects and attach it to
  * 'config' under ZPOOL_CONFIG_FEATURE_STATS.  An empty list is attached
  * when the pool is suspended.  The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *featlist;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 	VERIFY(nvlist_alloc(&featlist, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* We may be unable to read features if pool is suspended. */
 	if (!spa_suspended(spa)) {
 		/* Pass 0 walks the for-read object, pass 1 the for-write. */
 		for (int pass = 0; pass < 2; pass++) {
 			zap_cursor_t cursor;
 			zap_attribute_t attr;
 			uint64_t zapobj;
 
 			if (pass == 0)
 				zapobj = spa->spa_feat_for_read_obj;
 			else
 				zapobj = spa->spa_feat_for_write_obj;
 
 			if (zapobj == 0)
 				continue;
 
 			zap_cursor_init(&cursor, spa->spa_meta_objset, zapobj);
 			while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 				ASSERT(attr.za_integer_length ==
 				    sizeof (uint64_t) &&
 				    attr.za_num_integers == 1);
 				VERIFY3U(0, ==, nvlist_add_uint64(featlist,
 				    attr.za_name, attr.za_first_integer));
 				zap_cursor_advance(&cursor);
 			}
 			zap_cursor_fini(&cursor);
 		}
 	}
 
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    featlist) == 0);
 	nvlist_free(featlist);
 }
 
 /*
  * Open the named pool and return its generated config in '*config',
  * augmented with load time, error-log count, suspended state, spares,
  * l2cache and feature stats.  When 'altroot' is non-NULL the pool's
  * alternate root is copied into it (up to 'buflen'), even if the pool
  * could not be opened.  Returns the result of spa_open_common().
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	spa_t *spa;
 	int error;
 
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
 		 * This still leaves a window of inconsistency where the spares
 		 * or l2cache devices could change and the config would be
 		 * self-inconsistent.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loadtimes[2];
 
 			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
 			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
 			VERIFY(nvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
 
 			VERIFY(nvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_get_errlog_size(spa)) == 0);
 
 			if (spa_suspended(spa)) {
 				VERIFY(nvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    spa->spa_failmode) == 0);
 			}
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
 			spa_add_feature_stats(spa, *config);
 		}
 	}
 
 	/*
 	 * The alternate root is wanted even for faulted pools, so when the
 	 * open failed we cheat and go straight to spa_lookup().
 	 */
 	if (altroot != NULL) {
 		if (spa != NULL) {
 			spa_altroot(spa, altroot, buflen);
 		} else {
 			spa_t *faulted;
 
 			mutex_enter(&spa_namespace_lock);
 			faulted = spa_lookup(name);
 			if (faulted != NULL)
 				spa_altroot(faulted, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			mutex_exit(&spa_namespace_lock);
 		}
 	}
 
 	/* Undo the config lock and the open reference taken above. */
 	if (spa != NULL) {
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Validate that the auxiliary device array named 'config' in 'nvroot' is
  * well formed: it must be a non-empty array of nvlists, each of which
  * describes a valid leaf vdev.  When 'mode' is VDEV_ALLOC_SPARE or
  * VDEV_ALLOC_L2CACHE, failures to open or label an individual device are
  * tolerated, as long as the device description itself is well-formed.
  * Returns 0 when the array is absent or valid, otherwise an errno.
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **devlist;
 	uint_t ndev;
 	uint_t idx;
 	vdev_t *vd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* It's acceptable to have no devs specified. */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &devlist, &ndev) != 0)
 		return (0);
 
 	/* ... but an array that is present must not be empty. */
 	if (ndev == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = devlist;
 	sav->sav_npending = ndev;
 
 	for (idx = 0; idx < ndev; idx++) {
 		error = spa_config_parse(spa, &vd, devlist[idx], NULL, 0,
 		    mode);
 		if (error != 0)
 			break;
 
 		/* Only leaf vdevs may serve as auxiliary devices. */
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/*
 		 * The L2ARC currently only supports disk devices in
 		 * kernel context.  For user-level testing, we allow it.
 		 */
 #ifdef _KERNEL
 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
 			error = SET_ERROR(ENOTBLK);
 			vdev_free(vd);
 			break;
 		}
 #endif
 		vd->vdev_top = vd;
 
 		/* Open and label the device; record its guid on success. */
 		error = vdev_open(vd);
 		if (error == 0)
 			error = vdev_label_init(vd, crtxg, label);
 		if (error == 0) {
 			VERIFY(nvlist_add_uint64(devlist[idx],
 			    ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
 		}
 
 		vdev_free(vd);
 
 		/*
 		 * Open/label failures are fatal except in the spare and
 		 * l2cache allocation modes, where they are ignored.
 		 */
 		if (error != 0 && mode != VDEV_ALLOC_SPARE &&
 		    mode != VDEV_ALLOC_L2CACHE)
 			break;
 		error = 0;
 	}
 
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate both auxiliary device arrays (spares, then l2cache) found in
  * 'nvroot'.  Stops at and returns the first error; returns 0 if both pass.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 	if (error == 0) {
 		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (error);
 }
 
 /*
  * Record 'ndevs' device nvlists from 'devs' in sav->sav_config under the
  * array name 'config'.  If a config already exists the new devices are
  * appended to the existing array; otherwise a new config is created.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **existing;
 	nvlist_t **merged;
 	uint_t nexisting;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/* No list yet: start a new one holding just 'devs'. */
 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
 		    devs, ndevs) == 0);
 		return;
 	}
 
 	/*
 	 * Generate the new dev list by concatenating the current dev list
 	 * with the devices being added.
 	 */
 	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &existing, &nexisting) == 0);
 
 	merged = kmem_alloc(sizeof (void *) * (ndevs + nexisting), KM_SLEEP);
 
 	/* Existing entries first ... */
 	for (i = 0; i < nexisting; i++) {
 		VERIFY(nvlist_dup(existing[i], &merged[i], KM_SLEEP) == 0);
 	}
 	/* ... then the new ones, placed after them. */
 	for (i = 0; i < ndevs; i++) {
 		VERIFY(nvlist_dup(devs[i], &merged[i + nexisting],
 		    KM_SLEEP) == 0);
 	}
 
 	/* Swap the old array for the merged one. */
 	VERIFY(nvlist_remove(sav->sav_config, config,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    config, merged, ndevs + nexisting) == 0);
 
 	/* Release the temporary duplicates and the scratch array. */
 	for (i = 0; i < nexisting + ndevs; i++)
 		nvlist_free(merged[i]);
 	kmem_free(merged, (nexisting + ndevs) * sizeof (void *));
 }
 
 /*
  * Stop and drop level 2 ARC devices: every cache vdev of this pool that
  * is registered against a pool and currently present in the L2ARC is
  * removed from it.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	for (int idx = 0; idx < sav->sav_count; idx++) {
 		vdev_t *cachevd = sav->sav_vdevs[idx];
 		uint64_t pool;
 
 		ASSERT(cachevd != NULL);
 
 		if (!spa_l2cache_exists(cachevd->vdev_guid, &pool))
 			continue;
 		if (pool == 0ULL)
 			continue;
 		if (!l2arc_vdev_present(cachevd))
 			continue;
 
 		l2arc_remove_vdev(cachevd);
 	}
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' from the vdev tree described by 'nvroot'.
  * 'props' optionally carries pool properties (altroot, version, feature
  * properties, ...); 'zplprops' is handed to dsl_pool_create().
  *
  * Returns 0 on success, EEXIST if a pool with this name is already in the
  * namespace, or the error from property validation, config parsing, vdev
  * creation or auxiliary device validation.  On any of those failures the
  * partially constructed spa_t is torn down and removed again.
  *
  * spa_namespace_lock is held for the whole duration of the call.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
 	boolean_t has_features;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Note whether any feature property was requested; if so the pool
 	 * is created at SPA_VERSION regardless of any version property.
 	 */
 	has_features = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem)))
 			has_features = B_TRUE;
 	}
 
 	/* Default to SPA_VERSION when no explicit version was supplied. */
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	/* Seed the uberblock so that 'txg' is the first txg to sync. */
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 * (one root zio per CPU, up to max_ncpus).
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	/* A pool needs at least one device that can be allocated from. */
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	/*
 	 * Create the vdevs and validate the auxiliary devices; only if all
 	 * of that succeeds are the top-level vdevs sized and expanded.
 	 */
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_ashift_optimize(rvd->vdev_child[c]);
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Any failure so far: tear the new spa_t down and bail out. */
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only while the DSL pool is created. */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 
 	spa_update_dspace(spa);
 
 	/* Everything below up to dmu_tx_commit() is done in this tx. */
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES)
 		spa_feature_create_zap_objects(spa, tx);
 
 	/* Record the version the pool was created with. */
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
 		spa_history_create_obj(spa, tx);
 
 	/*
 	 * Set pool properties: start from the defaults, then apply any
 	 * caller-supplied properties on top.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	/* Start the sync thread(s) for the new pool. */
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
 	/*
 	 * We explicitly wait for the first transaction to complete so that our
 	 * bean counters are appropriately updated.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 
 	spa_history_log_version(spa, "create");
 
 	/* Remember the reference count of the freshly created pool. */
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 #ifdef _KERNEL
 #if defined(sun)
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
  */
 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 
 /*
  * Read the label from the given boot device and turn it into a pool
  * config whose vdev tree is a root vdev with the label's top-level vdev
  * as its only child.  The label's vdev guid is returned through 'guid'.
  * Returns NULL if the label cannot be read; otherwise the caller owns the
  * returned nvlist.
  */
 static nvlist_t *
 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
 {
 	nvlist_t *label;
 	nvlist_t *top_vdev;
 	nvlist_t *root_vdev;
 	uint64_t pool_guid;
 
 	if (vdev_disk_read_rootlabel(devpath, devid, &label) != 0)
 		return (NULL);
 
 	/*
 	 * Pull out the pieces of the label we need: the top-level vdev
 	 * (which becomes the root's child), the pool guid and the guid of
 	 * the device itself.
 	 */
 	VERIFY(nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE,
 	    &top_vdev) == 0);
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) == 0);
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, guid) == 0);
 
 	/*
 	 * Build a root vdev containing this pool's top-level vdev.
 	 */
 	VERIFY(nvlist_alloc(&root_vdev, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(root_vdev, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_GUID,
 	    pool_guid) == 0);
 	VERIFY(nvlist_add_nvlist_array(root_vdev, ZPOOL_CONFIG_CHILDREN,
 	    &top_vdev, 1) == 0);
 
 	/*
 	 * Install the new root vdev as the config's vdev_tree, replacing
 	 * the one that came from the label.
 	 */
 	VERIFY(nvlist_add_nvlist(label, ZPOOL_CONFIG_VDEV_TREE,
 	    root_vdev) == 0);
 	nvlist_free(root_vdev);
 
 	return (label);
 }
 
 /*
  * Recursively walk the vdev tree rooted at 'vd' looking for a leaf whose
  * on-disk label carries a more recent txg than '*txg'.  Whenever one is
  * found, '*txg' is raised to that label's txg and '*avd' is pointed at the
  * leaf.  Leaves whose label cannot be read are ignored.
  */
 static void
 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
 {
 	nvlist_t *label;
 	uint64_t label_txg;
 
 	/* Visit all children first. */
 	for (int child = 0; child < vd->vdev_children; child++)
 		spa_alt_rootvdev(vd->vdev_child[child], avd, txg);
 
 	/* Only leaf vdevs have a label to inspect. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
 	    &label) != 0)
 		return;
 
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
 	    &label_txg) == 0);
 
 	/*
 	 * Do we have a better boot device?
 	 */
 	if (*txg < label_txg) {
 		*avd = vd;
 		*txg = label_txg;
 	}
 
 	nvlist_free(label);
 }
 
 /*
  * Import a root pool.
  *
  * For x86. devpath_list will consist of devid and/or physpath name of
  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
  * The GRUB "findroot" command will return the vdev we should boot.
  *
  * For Sparc, devpath_list consists the physpath name of the booting device
  * no matter the rootpool is a single device pool or a mirrored pool.
  * e.g.
  *	"/pci@1f,0/ide@d/disk@0,0:a"
  *
  * Returns 0 on success, EIO if the label cannot be read, the error from
  * spa_config_parse(), ENOENT if the boot vdev is not part of the parsed
  * tree, or EINVAL if a better boot device exists or the boot device is
  * currently spared.
  */
 int
 spa_import_rootpool(char *devpath, char *devid)
 {
 	spa_t *spa;
 	vdev_t *rvd, *bvd, *avd = NULL;
 	nvlist_t *config, *nvtop;
 	uint64_t guid, txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(devpath, devid, &guid);
 #if defined(_OBP) && defined(_KERNEL)
 	if (config == NULL) {
 		if (strstr(devpath, "/iscsi/ssd") != NULL) {
 			/* iscsi boot */
 			get_iscsi_bootpath_phy(devpath);
 			config = spa_generate_rootconf(devpath, devid, &guid);
 		}
 	}
 #endif
 	if (config == NULL) {
 		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
 		return (SET_ERROR(EIO));
 	}
 
 	/* NB: 'pname' points into 'config' and dies with it. */
 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &pname) == 0);
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pname)) != NULL) {
 		/*
 		 * Remove the existing root pool from the namespace so that we
 		 * can replace it with the correct config we just read in.
 		 */
 		spa_remove(spa);
 	}
 
 	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		/*
 		 * Report the failure before freeing 'config': the pool name
 		 * string lives inside that nvlist.
 		 */
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    pname);
 		nvlist_free(config);
 		return (error);
 	}
 
 	/*
 	 * Get the boot vdev.
 	 */
 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
 		    (u_longlong_t)guid);
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/*
 	 * Determine if there is a better boot device.
 	 */
 	avd = bvd;
 	spa_alt_rootvdev(rvd, &avd, &txg);
 	if (avd != bvd) {
 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
 		    "try booting from '%s'", avd->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * If the boot device is part of a spare vdev then ensure that
 	 * we're booting off the active spare.
 	 */
 	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    !bvd->vdev_isspare) {
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
 		    bvd->vdev_parent->
 		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	error = 0;
 out:
 	/* The parsed tree was only needed for the checks above. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (error);
 }
 
 #else
 
 extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
     uint64_t *count);
 
 /*
  * Build a pool config for the root pool 'name' from the labels returned
  * by vdev_geom_read_pool_label().  The label with the highest txg is used
  * as the base config; its vdev tree is replaced by a root vdev whose
  * children are the top-level vdevs taken from the labels, with hole and
  * missing placeholders filling the slots for which no label was found.
  *
  * Returns NULL if the labels cannot be read or none of them carries a
  * non-zero txg; otherwise the caller owns the returned nvlist.
  */
 static nvlist_t *
 spa_generate_rootconf(const char *name)
 {
 	nvlist_t **configs, **tops;
 	nvlist_t *config;
 	nvlist_t *best_cfg, *nvtop, *nvroot;
 	uint64_t *holes;
 	uint64_t best_txg;
 	uint64_t nchildren;
 	uint64_t pgid;
 	uint64_t count;
 	uint64_t i;
 	uint_t   nholes;
 
 	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
 		return (NULL);
 
 	/*
 	 * Pick the label with the most recent txg.  Slots of 'configs' may
 	 * be NULL (the copy loop below already allows for that), so skip
 	 * them here as well instead of tripping the VERIFY.
 	 */
 	ASSERT3U(count, !=, 0);
 	best_cfg = NULL;
 	best_txg = 0;
 	for (i = 0; i < count; i++) {
 		uint64_t txg;
 
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
 		    &txg) == 0);
 		if (txg > best_txg) {
 			best_txg = txg;
 			best_cfg = configs[i];
 		}
 	}
 
 	/*
 	 * No usable label: release what we were given and report failure
 	 * rather than continuing with an uninitialized 'best_cfg'.
 	 */
 	if (best_cfg == NULL) {
 		for (i = 0; i < count; i++)
 			nvlist_free(configs[i]);
 		kmem_free(configs, count * sizeof(void *));
 		return (NULL);
 	}
 
 	/* Both lookups are optional; the defaults apply when absent. */
 	nchildren = 1;
 	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
 	holes = NULL;
 	nholes = 0;
 	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
 	    &holes, &nholes);
 
 	/* Copy the top-level vdev out of every label we have. */
 	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
 	for (i = 0; i < nchildren; i++) {
 		if (i >= count)
 			break;
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
 		    &nvtop) == 0);
 		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
 	}
 
 	/*
 	 * Fill in hole vdevs.  holes[i] is used as an index into 'tops',
 	 * so it is the value that must be range-checked, not 'i'.
 	 */
 	for (i = 0; holes != NULL && i < nholes; i++) {
 		if (holes[i] >= nchildren)
 			continue;
 		if (tops[holes[i]] != NULL)
 			continue;
 		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
 		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_HOLE) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
 		    holes[i]) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/* Whatever is still empty becomes a "missing" placeholder. */
 	for (i = 0; i < nchildren; i++) {
 		if (tops[i] != NULL)
 			continue;
 		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
 		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_MISSING) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
 		    i) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/*
 	 * Create pool config based on the best vdev config.
 	 */
 	VERIFY(nvlist_dup(best_cfg, &config, KM_SLEEP) == 0);
 
 	/*
 	 * Put this pool's top-level vdevs into a root vdev.
 	 */
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pgid) == 0);
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
 	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    tops, nchildren) == 0);
 
 	/*
 	 * Replace the existing vdev_tree with the new root vdev in
 	 * this pool's configuration (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 
 	/*
 	 * Drop vdev config elements that should not be present at pool level.
 	 */
 	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
 	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
 
 	/* 'config' holds its own copies; release all the scratch data. */
 	for (i = 0; i < count; i++)
 		nvlist_free(configs[i]);
 	kmem_free(configs, count * sizeof(void *));
 	for (i = 0; i < nchildren; i++)
 		nvlist_free(tops[i]);
 	kmem_free(tops, nchildren * sizeof(void *));
 	nvlist_free(nvroot);
 	return (config);
 }
 
 /*
  * Import the root pool called 'name'.
  *
  * The config is generated from the on-disk labels when they can be read;
  * otherwise the config of a pool of that name already in the namespace is
  * used.  The pool is marked as a root pool with ZFS_IMPORT_VERBATIM and
  * its vdev tree is parsed once as a sanity check.
  *
  * Returns 0 on success, EIO if neither a label nor an existing pool is
  * found, or the error from spa_config_parse().
  */
 int
 spa_import_rootpool(const char *name)
 {
 	spa_t *spa;
 	vdev_t *rvd;
 	nvlist_t *config, *nvtop;
 	uint64_t txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(name);
 
 	mutex_enter(&spa_namespace_lock);
 	if (config != NULL) {
 		/* NB: 'pname' points into 'config' and dies with it. */
 		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &pname) == 0 && strcmp(name, pname) == 0);
 		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
 		    == 0);
 
 		if ((spa = spa_lookup(pname)) != NULL) {
 			/*
 			 * Remove the existing root pool from the namespace so
 			 * that we can replace it with the correct config
 			 * we just read in.
 			 */
 			spa_remove(spa);
 		}
 		spa = spa_add(pname, config, NULL);
 
 		/*
 		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
 		 * via spa_version().
 		 */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 		    &spa->spa_ubsync.ub_version) != 0)
 			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 	} else if ((spa = spa_lookup(name)) == NULL) {
 		/* Do not leave with the namespace lock still held. */
 		mutex_exit(&spa_namespace_lock);
 		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
 		    name);
 		return (SET_ERROR(EIO));
 	} else {
 		/* No label, but the pool is known: use its cached config. */
 		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
 	}
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		/*
 		 * Use 'name' here: 'pname' is only set on the label path
 		 * and points into the config that has just been freed.
 		 * The two strings are verified to be equal above.
 		 */
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    name);
 		return (error);
 	}
 
 	/* The parsed tree was only needed to prove the config is usable. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (0);
 }
 
 #endif	/* sun */
 #endif
 
 /*
  * Import a non-root pool into the system.
  *
  * Arguments:
  *	pool	name under which the pool is added to the namespace
  *	config	caller-supplied pool configuration; also used to hand load
  *		information back to the caller (see below)
  *	props	optional pool properties (may be NULL); "altroot" and
  *		"readonly" are consulted here
  *	flags	stored in spa_import_flags; ZFS_IMPORT_VERBATIM selects the
  *		verbatim path described below
  *
  * Returns 0 on success, EEXIST if a pool named 'pool' is already in the
  * namespace, or the error produced while loading the pool, validating its
  * spare/l2cache devices, or applying 'props'.  On any such failure the
  * spa_t created here is unloaded, deactivated and removed again.
  *
  * For a non-verbatim import the contents of spa_load_info are added to
  * 'config' under ZPOOL_CONFIG_LOAD_INFO whether or not the load succeeded.
  *
  * spa_namespace_lock is taken on entry and held until just before the
  * history record is written.
  */
 int
 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_rewind_policy_t policy;
 	uint64_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 *
 	 * The two property lookups are best-effort: their return values are
 	 * ignored, so when a property is not present the defaults set up in
 	 * the declarations above (no altroot, not read-only) are presumably
 	 * left in place.  A read-only import overrides the global open mode
 	 * with FREAD.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	if (readonly)
 		mode = FREAD;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.  The pool is not activated or
 	 * loaded on this path; only the config file setting and the config
 	 * cache are updated.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 
 		mutex_exit(&spa_namespace_lock);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	/*
 	 * A rewind request in the caller's config switches the load from a
 	 * plain import to a recovery load.
 	 */
 	zpool_get_rewind_policy(config, &policy);
 	if (policy.zrp_request & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	/*
 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 *
 	 * Unless we are recovering, spa_last_ubsync_txg and spa_load_txg are
 	 * reset to zero before the load.
 	 */
 	if (state != SPA_LOAD_RECOVER)
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
 	    policy.zrp_request);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 * This is done before 'error' is examined, so the caller gets the
 	 * load info for failed imports as well.
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	/* Same treatment for the l2cache device list. */
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	/*
 	 * Validate the spares and l2cache devices named in the caller's
 	 * vdev tree, but only if nothing has failed so far; the first
 	 * error encountered is the one that gets reported.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_SPARE);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_L2CACHE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	/*
 	 * Give up if the load or the aux validation failed, or if properties
 	 * were supplied for a writeable pool and could not be applied (in
 	 * which case 'error' is set by the spa_prop_set() call embedded in
 	 * the condition).  The half-built spa_t is torn down and removed
 	 * from the namespace.
 	 */
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 * For each list: drop the old array from sav_config (or allocate
 	 * sav_config if there is none yet), install the caller's array,
 	 * reload the devices under the config lock and flag the list as
 	 * needing a sync.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	/* The history record is written after the namespace lock is dropped. */
 	mutex_exit(&spa_namespace_lock);
 	spa_history_log_version(spa, "import");
 
 	/* zvol minors are only created in FreeBSD kernel builds. */
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	zvol_create_minors(pool);
 #endif
 #endif
 	return (0);
 }
 
 /*
  * Probe the pool described by 'tryconfig' without importing it.
  *
  * The pool is added under the temporary name TRYIMPORT_NAME, activated
  * with FREAD, loaded, and then unloaded and removed again before this
  * function returns.  If the load got far enough to build a root vdev, a
  * freshly generated config is returned, annotated with the caller's pool
  * name and state, the uberblock timestamp, the load info, the bootfs
  * dataset name (when one can be resolved) and the spare and l2cache
  * device lists.
  *
  * Returns NULL when 'tryconfig' carries no pool name or pool state, or
  * when no root vdev could be constructed.
  */
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *result = NULL;
 	char *realname;
 	uint64_t pool_state;
 	spa_t *tryspa;
 	int load_err;
 
 	/* Both the pool name and the pool state are mandatory. */
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME,
 	    &realname) != 0 ||
 	    nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE,
 	    &pool_state) != 0)
 		return (NULL);
 
 	/*
 	 * Set up a throw-away spa under the temporary name.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	tryspa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(tryspa, FREAD);
 
 	/*
 	 * spa_load() does the real work.  mosconfig is B_TRUE because the
 	 * user-supplied config is the one to trust when doing an import.
 	 */
 	load_err = spa_load(tryspa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING,
 	    B_TRUE);
 
 	/*
 	 * Without a root vdev 'tryconfig' was not even parsable, so there
 	 * is nothing to report back.
 	 */
 	if (tryspa->spa_root_vdev == NULL)
 		goto out;
 
 	result = spa_config_generate(tryspa, NULL, -1ULL, B_TRUE);
 	VERIFY(nvlist_add_string(result, ZPOOL_CONFIG_POOL_NAME,
 	    realname) == 0);
 	VERIFY(nvlist_add_uint64(result, ZPOOL_CONFIG_POOL_STATE,
 	    pool_state) == 0);
 	VERIFY(nvlist_add_uint64(result, ZPOOL_CONFIG_TIMESTAMP,
 	    tryspa->spa_uberblock.ub_timestamp) == 0);
 	VERIFY(nvlist_add_nvlist(result, ZPOOL_CONFIG_LOAD_INFO,
 	    tryspa->spa_load_info) == 0);
 
 	/*
 	 * Export the bootfs property, if the pool has one, so that external
 	 * consumers can tell which pools are bootable.
 	 */
 	if ((load_err == 0 || load_err == EEXIST) &&
 	    tryspa->spa_bootfs != 0) {
 		char *rawname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 		/*
 		 * The pool was opened as TRYIMPORT_NAME, so the dataset name
 		 * we get back starts with that; splice the real pool name in
 		 * front of everything after the first '/'.
 		 */
 		if (dsl_dsobj_to_dsname(spa_name(tryspa), tryspa->spa_bootfs,
 		    rawname) == 0) {
 			char *bootfs = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 			char *slash = strchr(rawname, '/');
 
 			if (slash != NULL) {
 				(void) snprintf(bootfs, MAXPATHLEN,
 				    "%s/%s", realname, slash + 1);
 			} else {
 				(void) strlcpy(bootfs, rawname, MAXPATHLEN);
 			}
 			VERIFY(nvlist_add_string(result,
 			    ZPOOL_CONFIG_BOOTFS, bootfs) == 0);
 			kmem_free(bootfs, MAXPATHLEN);
 		}
 		kmem_free(rawname, MAXPATHLEN);
 	}
 
 	/*
 	 * Append the hot spares and level 2 cache devices.
 	 */
 	spa_config_enter(tryspa, SCL_CONFIG, FTAG, RW_READER);
 	spa_add_spares(tryspa, result);
 	spa_add_l2cache(tryspa, result);
 	spa_config_exit(tryspa, SCL_CONFIG, FTAG);
 
 out:
 	/* The temporary spa never outlives this call. */
 	spa_unload(tryspa);
 	spa_deactivate(tryspa);
 	spa_remove(tryspa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (result);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  * Arguments:
  *	pool		name of the pool to operate on
  *	new_state	pool state to move to.  POOL_STATE_UNINITIALIZED is
  *			the "reset" case: the pool is unloaded but its state
  *			is not rewritten and it stays in the namespace.
  *	oldconfig	if non-NULL, *oldconfig is first cleared and, on
  *			success, receives a duplicate of spa_config when the
  *			pool has one
  *	force		allow exporting a pool with an active shared spare
  *	hardforce	skip dirtying the labels and syncing the config cache
  *
  * Returns 0 on success, EROFS if spa_mode_global lacks FWRITE, ENOENT if
  * no pool named 'pool' exists, EBUSY if the pool is still referenced, or
  * EXDEV if an export is refused because of an active shared spare.
  */
 static int
 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	spa_t *spa;
 
 	/* Make sure every early return leaves *oldconfig well defined. */
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & FWRITE))
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 
 	/*
 	 * The pool will be in core if it's openable,
 	 * in which case we can modify its state.
 	 */
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
 		/*
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
 		 * references.  If we are resetting a pool, allow references by
 		 * fault injection handlers.
 		 *
 		 * Async tasks were suspended above, so they are resumed
 		 * before bailing out.
 		 */
 		if (!spa_refcount_zero(spa) ||
 		    (spa->spa_inject_ref != 0 &&
 		    new_state != POOL_STATE_UNINITIALIZED)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EBUSY));
 		}
 
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. At user's own will, such pool can
 		 * be forcedly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EXDEV));
 		}
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 	/*
 	 * NOTE(review): this event is posted unconditionally, i.e. for
 	 * export and reset as well as destroy -- confirm that is intended.
 	 */
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand a private copy of the config back before the spa goes away. */
 	if (oldconfig && spa->spa_config)
 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
 
 	/*
 	 * Export/destroy removes the pool from the namespace (syncing the
 	 * config cache first unless 'hardforce' is set); a reset leaves the
 	 * spa_t in place.
 	 */
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Destroy a storage pool.
  *
  * Thin wrapper around spa_export_common(): moves 'pool' to
  * POOL_STATE_DESTROYED without force or hardforce and without asking for
  * the old config.  Returns whatever spa_export_common() returns.
  */
 int
 spa_destroy(char *pool)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE,
 	    B_FALSE);
 
 	return (rc);
 }
 
 /*
  * Export a storage pool.
  *
  * Thin wrapper around spa_export_common(): moves 'pool' to
  * POOL_STATE_EXPORTED, passing 'oldconfig', 'force' and 'hardforce'
  * straight through.  Returns whatever spa_export_common() returns.
  */
 int
 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force,
 	    hardforce);
 
 	return (rc);
 }
 
 /*
  * Unload the spa_t for 'pool' the way spa_export() would, but leave it in
  * the namespace: spa_export_common() is invoked with
  * POOL_STATE_UNINITIALIZED, no old-config request, and neither force nor
  * hardforce.  Returns whatever spa_export_common() returns.
  */
 int
 spa_reset(char *pool)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE,
 	    B_FALSE);
 
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * Add a device to a storage pool.
  *
  * 'nvroot' describes the devices to add: top-level vdevs (its children)
  * and/or hot spares and level 2 cache devices.
  *
  * Returns 0 on success.  On failure the value handed back by
  * spa_vdev_exit() is returned: the spa_config_parse(), vdev_create() or
  * spa_validate_aux() error, or EINVAL when 'nvroot' names no top-level
  * vdevs, spares or l2cache devices at all.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	vdev_t *parsed, *top;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t txg, slot;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	error = spa_config_parse(spa, &parsed, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD);
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* spa_vdev_exit() is responsible for clearing this again */
 	spa->spa_pending_vdev = parsed;
 
 	/* A missing spare or l2cache array simply counts as empty. */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* There has to be something to add. */
 	if (parsed->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, parsed, txg, EINVAL));
 
 	if (parsed->vdev_children != 0) {
 		error = vdev_create(parsed, txg, B_FALSE);
 		if (error != 0)
 			return (spa_vdev_exit(spa, parsed, txg, error));
 	}
 
 	/*
 	 * The spares and l2cache devices have to be validated after the
 	 * children have been checked; otherwise vdev_inuse() will blindly
 	 * overwrite the spare.
 	 */
 	error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD);
 	if (error != 0)
 		return (spa_vdev_exit(spa, parsed, txg, error));
 
 	/*
 	 * Move every new top-level vdev from the parsed tree to the pool's
 	 * root vdev.  The loop bound is deliberately re-read on each pass,
 	 * since vdev_remove_child() may presumably update the child count.
 	 */
 	for (int i = 0; i < parsed->vdev_children; i++) {
 
 		/*
 		 * Pick the id of the first hole, if the root has one, and
 		 * free that hole; otherwise the id ends up one past the
 		 * current last child.
 		 */
 		slot = 0;
 		while (slot < root->vdev_children &&
 		    !root->vdev_child[slot]->vdev_ishole)
 			slot++;
 		if (slot < root->vdev_children)
 			vdev_free(root->vdev_child[slot]);
 
 		top = parsed->vdev_child[i];
 		vdev_remove_child(parsed, top);
 		top->vdev_id = slot;
 		vdev_add_child(root, top);
 		vdev_config_dirty(top);
 	}
 
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Adding vdevs to a live pool needs care.  Should other threads
 	 * begin allocating from the new vdevs before the config cache has
 	 * been synced, and power is lost, the pool may fail to open after
 	 * reboot because it contains DVAs the config cache cannot
 	 * translate.  So the vdevs are first added without initializing
 	 * their metaslabs, then the config cache is synced (through
 	 * spa_vdev_exit()), and only then does spa_config_update()
 	 * initialize the new metaslabs.
 	 *
 	 * spa_load() looks for added-but-not-initialized vdevs, so if power
 	 * is lost anywhere in this sequence the remaining steps are
 	 * finished the next time the pool is loaded.
 	 */
 	(void) spa_vdev_exit(spa, parsed, txg, 0);
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a mirror.  The arguments are the path to any device
  * in the mirror, and the nvroot for the new device.  If the path specifies
  * a device that is not mirrored, we automatically insert the mirror vdev.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * Arguments:
  *	spa		pool to operate on (must be writeable)
  *	guid		guid of the existing leaf vdev to attach to / replace
  *	nvroot		description of the new device; must parse to a root
  *			with exactly one leaf child
  *	replacing	non-zero for a replace, zero for a plain attach
  *
  * Returns 0 on success.  Failures are reported through spa_vdev_exit():
  *	ENODEV		no vdev with 'guid' was found
  *	ENOTSUP		'guid' is not a leaf, a spare would replace a log
  *			device, or the parent vdev type does not permit the
  *			requested operation
  *	EINVAL		'nvroot' could not be parsed or does not describe
  *			exactly one leaf device
  *	EOVERFLOW	the new device is smaller than
  *			vdev_get_min_asize(oldvd)
  *	EDOM		the new device's ashift exceeds that of the
  *			top-level vdev
  * or the error returned by vdev_create().
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!oldvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = oldvd->vdev_parent;
 
 	/*
 	 * Note that the specific spa_config_parse() error is not propagated;
 	 * every parse failure is reported as EINVAL.
 	 */
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* The new config must describe exactly one device ... */
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	/* ... and that device must be a leaf. */
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * Spares can't replace logs
 	 */
 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 	/*
 	 * Decide which vdev ops the parent of oldvd/newvd has to have
 	 * (mirror, spare or replacing) and reject unsupported combinations.
 	 */
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or the root
 		 * vdev.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 *
 		 * Replacing a device that is itself under a replacing vdev is
 		 * refused on pools older than SPA_VERSION_MULTI_REPLACE.
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 * The new path is "<newvd path>/old"; the "+ 5" in the allocation
 	 * covers the four characters of "/old" plus the terminating NUL.
 	 */
 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/%s",
 		    newvd->vdev_path, "old");
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 	}
 
 	/* mark the device being resilvered */
 	newvd->vdev_resilver_txg = txg;
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (pvd->vdev_ops != pvops)
 		pvd = vdev_add_parent(oldvd, pvops);
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 	ASSERT(pvd->vdev_ops == pvops);
 	ASSERT(oldvd->vdev_parent == pvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 * It becomes pvd's last child and inherits oldvd's creation txg.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
 	}
 
 	/*
 	 * Take private copies of what the history record below needs; they
 	 * are used after spa_vdev_exit() has been called.
 	 */
 	oldvdpath = spa_strdup(oldvd->vdev_path);
 	newvdpath = spa_strdup(newvd->vdev_path);
 	newvd_isspare = newvd->vdev_isspare;
 
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	/*
 	 * Schedule the resilver to restart in the future. We do this to
 	 * ensure that dmu_sync-ed blocks have been stitched into the
 	 * respective datasets.
 	 */
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	/*
 	 * Commit the config.  Note that dtl_max_txg, not txg, is what gets
 	 * handed to spa_vdev_exit() here.
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	/*
 	 * Log the operation as "spare in", "replace" or "attach", depending
 	 * on 'replacing' and on whether the new device is a spare.
 	 */
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	/* Pools with a bootfs additionally get a dedicated attach event. */
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing vdev.
  *
  * Arguments:
  *	spa		pool to operate on (must be writeable)
  *	guid		guid of the leaf vdev to detach
  *	pguid		expected guid of that vdev's parent, or 0 to skip
  *			the parent check
  *	replace_done	non-zero when called to finish a replace/spare
  *			operation; the parent must then be a replacing or
  *			spare vdev
  *
  * Returns the result of the final spa_vdev_exit() on success.  Failures
  * are likewise reported through spa_vdev_exit():
  *	ENODEV		no vdev with 'guid' was found
  *	ENOTSUP		'guid' is not a leaf, or its parent is not a vdev
  *			type that supports the requested detach
  *	EBUSY		the parent does not match 'pguid', or the device
  *			holds the only valid copy of some data
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			/* sibling path is exactly "<vd path>/old"? */
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a spare, then it implies
 	 * that the spare should become a real disk, and be removed from the
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_id == 0 &&
 	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!  The result is therefore explicitly
 	 * discarded.
 	 */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 *
 	 * The path is copied here because it is logged after vd has been
 	 * handed to spa_vdev_exit().  vd->vdev_path is treated as optional
 	 * earlier in this function, so a vdev without a path is logged as
 	 * "none" rather than passing NULL to spa_strdup().
 	 */
 	vdpath = spa_strdup(vd->vdev_path != NULL ? vd->vdev_path : "none");
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * Hold the other pool so the namespace lock can be
 			 * dropped around the removal.
 			 */
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_offline_log(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || vd->vdev_ishole) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    vml[c]->vdev_ishole ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c])) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
 	}
 
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    glist, children) == 0);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
 	    nvl) == 0);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    spa_version(spa)) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 #ifndef sun
 	/* mark that we are creating new spa by splitting */
 	newspa->spa_splitting_newspa = B_TRUE;
 #endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
 #ifndef sun
 	newspa->spa_splitting_newspa = B_FALSE;
 #endif
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 			vdev_free(vml[c]);
 		}
 	}
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Search the first 'count' entries of 'nvpp' for the nvlist whose
  * ZPOOL_CONFIG_GUID value equals 'target_guid'.  Every entry that is
  * examined must carry that key (enforced with VERIFY).  Returns the first
  * matching nvlist, or NULL when no entry matches.
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	nvlist_t *found = NULL;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		uint64_t guid;
 
 		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		if (guid != target_guid)
 			continue;
 
 		/* First hit wins; stop scanning. */
 		found = nvpp[i];
 		break;
 	}
 
 	return (found);
 }
 
 /*
  * Replace the nvlist array stored under 'name' in 'config' with a copy of
  * 'dev' (which has 'count' entries) that omits 'dev_to_remove'.  The
  * duplicated entries are only needed until they have been added to
  * 'config', so they are freed again before returning.
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 	nvlist_t *dev_to_remove)
 {
 	nvlist_t **newdev = NULL;
 	int i, j;
 
 	/* No scratch array is needed when nothing will be left over. */
 	if (count > 1)
 		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
 
 	/* Duplicate every surviving entry into the next free slot. */
 	i = 0;
 	j = 0;
 	while (i < count) {
 		if (dev[i] != dev_to_remove) {
 			VERIFY(nvlist_dup(dev[i], &newdev[j++],
 			    KM_SLEEP) == 0);
 		}
 		i++;
 	}
 
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
 
 	/* Release the temporary copies and the scratch array itself. */
 	if (count > 1) {
 		for (i = 0; i < count - 1; i++)
 			nvlist_free(newdev[i]);
 		kmem_free(newdev, (count - 1) * sizeof (void *));
 	}
 }
 
 /*
  * Evacuate a top-level vdev ahead of its removal.
  *
  * Only log vdevs can be evacuated; any other vdev yields ENOTSUP.  A log
  * vdev that still has space allocated is evacuated with spa_offline_log(),
  * and a failure from that call is returned unchanged.  On success the vdev
  * is marked as removing and its leaves' DTLs and its config are dirtied.
  *
  * The caller must hold spa_namespace_lock and must not hold the config
  * lock as writer (both are asserted).
  */
 static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
 	uint64_t txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Evacuate the device.  We don't hold the config lock as writer
 	 * since we need to do I/O but we do keep the
 	 * spa_namespace_lock held.  Once this completes the device
 	 * should no longer have any blocks allocated on it.
 	 */
 	if (!vd->vdev_islog)
 		return (SET_ERROR(ENOTSUP));
 
 	if (vd->vdev_stat.vs_alloc != 0) {
 		int error = spa_offline_log(spa);
 
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * The evacuation succeeded.  Remove any remaining MOS metadata
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	ASSERT0(vd->vdev_stat.vs_alloc);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
 	vdev_dirty_leaves(vd, VDD_DTL, txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	return (0);
 }
 
 /*
  * Finish removing an (empty) top-level vdev by taking it out of the root
  * vdev's child list.
  *
  * Nothing is done if the vdev still has space allocated.  Otherwise its
  * label is removed, it is taken off the dirty lists and freed.  If it
  * occupied the last child slot the root's child array is compacted;
  * otherwise a vdev using vdev_hole_ops is added with the same id.
  *
  * The caller must hold spa_namespace_lock and all config locks as writer
  * (both are asserted).
  */
 static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t slot = vd->vdev_id;
 	boolean_t tail_slot = (slot == (root->vdev_children - 1));
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
 	/* Devices that still hold data are left alone. */
 	if (vd->vdev_stat.vs_alloc != 0)
 		return;
 
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* Take the vdev off both dirty lists before it is freed. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	vdev_free(vd);
 
 	if (!tail_slot) {
 		/* An interior slot is refilled with a hole vdev. */
 		vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 
 		vdev_add_child(root, hole);
 	} else {
 		vdev_compact_children(root);
 	}
 	vdev_config_dirty(root);
 
 	/*
 	 * Reassess the health of our root vdev.
 	 */
 	vdev_reopen(root);
 }
 
 /*
  * Remove a device from the pool -
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  *
  * Currently, this supports removing only hot spares, slogs, and level 2 ARC
  * devices.
  *
  * 'guid' identifies the device to remove.  'unspare' allows a hot spare to
  * be removed even though a vdev with that guid is currently found in the
  * pool.
  *
  * The error is EBUSY for a spare that is in use when 'unspare' is not set,
  * ENOTSUP for a pool vdev that is not a log device, ENOENT when no device
  * with the given guid is found, or whatever spa_vdev_remove_evacuate()
  * reported for a log device.  When this function took the locks itself
  * (the caller did not hold spa_namespace_lock), that error is handed to
  * spa_vdev_exit() and its return value is returned; otherwise the error is
  * returned directly.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0;
 	/* Remember whether the caller already holds the namespace lock. */
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
 	ASSERT(spa_writeable(spa));
 
 	/* Only enter (and later exit) if the caller has not done so. */
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * The guid is checked against the configured spares first, then the
 	 * configured l2cache devices, and only then against pool vdevs.
 	 */
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		/*
 		 * Cache devices can always be removed.
 		 */
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		/* Log removal expects that this function took the locks. */
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
 		mg = vd->vdev_mg;
 
 		/*
 		 * Stop allocating from this vdev.
 		 */
 		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		/*
 		 * Attempt to evacuate the vdev.
 		 */
 		error = spa_vdev_remove_evacuate(spa, vd);
 
 		/* Re-enter the config lock for the unwind or cleanup below. */
 		txg = spa_vdev_config_enter(spa);
 
 		/*
 		 * If we couldn't evacuate the vdev, unwind.
 		 */
 		if (error) {
 			metaslab_group_activate(mg);
 			return (spa_vdev_exit(spa, NULL, txg, error));
 		}
 
 		/*
 		 * Clean up the vdev namespace.
 		 */
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
 		 * Normal vdevs cannot be removed (yet).
 		 */
 		error = SET_ERROR(ENOTSUP);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = SET_ERROR(ENOENT);
 	}
 
 	/* Balance the spa_vdev_enter() made above, if any. */
 	if (!locked)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	return (error);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd', children first, looking for a vdev
  * that can now be detached: the old half of a finished replacement, the
  * counterpart of a vdev flagged 'unspare' under a spare vdev, or a surplus
  * spare.  Returns that vdev, or NULL if nothing qualifies.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd = NULL;
 	vdev_t *oldvd = NULL;
 	vdev_t *first, *last;
 	int c;
 
 	/* A candidate anywhere below us takes precedence over 'vd' itself. */
 	for (c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
 	 * the newest (see spa_vdev_attach() for how that works).  In
 	 * the case where the newest vdev is faulted, we will not automatically
 	 * remove it after a resilver completes.  This is OK as it will require
 	 * user intervention to determine which disk the admin wishes to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		oldvd = vd->vdev_child[0];
 		newvd = vd->vdev_child[vd->vdev_children - 1];
 
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
 
 	/* Everything below only applies to spare vdevs. */
 	if (vd->vdev_ops != &vdev_spare_ops)
 		return (NULL);
 
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 */
 	first = vd->vdev_child[0];
 	last = vd->vdev_child[vd->vdev_children - 1];
 
 	oldvd = NULL;
 	if (last->vdev_unspare) {
 		oldvd = first;
 		newvd = last;
 	} else if (first->vdev_unspare) {
 		oldvd = last;
 		newvd = first;
 	}
 
 	if (oldvd != NULL &&
 	    vdev_dtl_empty(newvd, DTL_MISSING) &&
 	    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 	    !vdev_dtl_required(oldvd))
 		return (oldvd);
 
 	/*
 	 * If there are more than two spares attached to a disk,
 	 * and those spares are not required, then we want to
 	 * attempt to free them up now so that they can be used
 	 * by other pools.  Once we're back down to a single
 	 * disk+spare, we stop removing them.
 	 */
 	if (vd->vdev_children <= 2)
 		return (NULL);
 
 	newvd = vd->vdev_child[1];
 	if (newvd->vdev_isspare && last->vdev_isspare &&
 	    vdev_dtl_empty(last, DTL_MISSING) &&
 	    vdev_dtl_empty(last, DTL_OUTAGE) &&
 	    !vdev_dtl_required(newvd))
 		return (newvd);
 
 	return (NULL);
 }
 
 /*
  * Detach every vdev that spa_vdev_resilver_done_hunt() reports, one at a
  * time, until none is left or a detach fails.
  *
  * All config locks are held as writer while hunting and while the guids
  * are read, and are dropped before each call to spa_vdev_detach().  The
  * function therefore returns without the config lock on every path.
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd, *pvd, *ppvd;
 	uint64_t guid, sguid, pguid, ppguid;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		/*
 		 * Capture the guids now; the detach calls below take guids,
 		 * and are made after the config lock has been dropped.
 		 */
 		pvd = vd->vdev_parent;
 		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
 		pguid = pvd->vdev_guid;
 		ppguid = ppvd->vdev_guid;
 		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		/* The lock is already released here, so a plain return is fine. */
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		/* sguid stays 0 unless the spare case above applied. */
 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Store a new path ('ispath' set) or FRU string ('ispath' clear) on the
  * leaf vdev identified by 'guid'.
  *
  * The stored string is only replaced when 'value' differs from it (or,
  * for the FRU, when none is stored yet).  The vdev is passed on to
  * spa_vdev_state_exit() only when something actually changed.  ENOENT is
  * reported when the guid is not found and ENOTSUP when the vdev is not a
  * leaf; in every case the return value is that of spa_vdev_state_exit().
  */
 int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	boolean_t changed = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (ispath) {
 		if (strcmp(value, vd->vdev_path) != 0) {
 			spa_strfree(vd->vdev_path);
 			vd->vdev_path = spa_strdup(value);
 			changed = B_TRUE;
 		}
 	} else if (vd->vdev_fru == NULL ||
 	    strcmp(value, vd->vdev_fru) != 0) {
 		/* Drop the previous FRU string, if there was one. */
 		if (vd->vdev_fru != NULL)
 			spa_strfree(vd->vdev_fru);
 		vd->vdev_fru = spa_strdup(value);
 		changed = B_TRUE;
 	}
 
 	if (changed)
 		return (spa_vdev_state_exit(spa, vd, 0));
 
 	return (spa_vdev_state_exit(spa, NULL, 0));
 }
 
 /*
  * Set the stored path of the vdev identified by 'guid' to 'newpath'.
  * Thin wrapper around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	int error;
 
 	error = spa_vdev_set_common(spa, guid, newpath, B_TRUE);
 	return (error);
 }
 
 /*
  * Set the stored FRU of the vdev identified by 'guid' to 'newfru'.
  * Thin wrapper around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	int error;
 
 	error = spa_vdev_set_common(spa, guid, newfru, B_FALSE);
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 
 /*
  * Cancel the pool's current scan via dsl_scan_cancel().  Fails with EBUSY
  * when dsl_scan_resilvering() reports that a resilver is in progress.  The
  * caller must not hold the config lock as writer (asserted).
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		error = dsl_scan_cancel(spa->spa_dsl_pool);
 	else
 		error = SET_ERROR(EBUSY);
 
 	return (error);
 }
 
 /*
  * Start a scan of kind 'func' on the pool.
  *
  * Returns ENOTSUP for POOL_SCAN_NONE or a value at or beyond
  * POOL_SCAN_FUNCS.  A resilver request for which vdev_resilver_needed()
  * reports nothing to do is answered by queueing SPA_ASYNC_RESILVER_DONE
  * and returning 0.  Otherwise the result of dsl_scan() is returned.  The
  * caller must not hold the config lock as writer (asserted).
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func == POOL_SCAN_NONE || func >= POOL_SCAN_FUNCS)
 		return (SET_ERROR(ENOTSUP));
 
 	/* Anything but a no-op resilver goes straight to the scan code. */
 	if (func != POOL_SCAN_RESILVER ||
 	    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 		return (dsl_scan(spa->spa_dsl_pool, func));
 
 	/*
 	 * If a resilver was requested, but there is no DTL on a
 	 * writeable leaf device, we have nothing to do.
 	 */
 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	return (0);
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Process pending removal requests in the vdev tree rooted at 'vd'.
  *
  * Any vdev with vdev_remove_wanted set has that flag (and
  * vdev_delayed_close) cleared, is put into VDEV_STATE_REMOVED, has its
  * read/write/checksum error counters zeroed, and has its top-level vdev's
  * state marked dirty.  All children are then visited recursively.
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	int c;
 
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_delayed_close = B_FALSE;
 		vd->vdev_remove_wanted = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * We want to clear the stats, but we don't want to do a full
 		 * vdev_clear() as that will cause us to throw away
 		 * degraded/faulted state as well as attempt to reopen the
 		 * device, all of which is a waste.
 		 */
 		vd->vdev_stat.vs_checksum_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_read_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 	}
 
 	c = 0;
 	while (c < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Process pending probe requests in the vdev tree rooted at 'vd': every
  * vdev with vdev_probe_wanted set has the flag cleared and is reopened.
  * All children are then visited recursively.
  */
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	int c = 0;
 
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
 	}
 
 	while (c < vd->vdev_children) {
 		spa_async_probe(spa, vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Post an ESC_ZFS_VDEV_AUTOEXPAND sysevent for every leaf vdev that has a
  * physical path in the tree rooted at 'vd'.  Does nothing unless the
  * pool's spa_autoexpand setting is on.  The event carries the physical
  * path prefixed with "/devices" as its DEV_PHYS_PATH attribute.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	sysevent_id_t eid;
 	nvlist_t *attr;
 	char *physpath;
 	int c;
 
 	if (!spa->spa_autoexpand)
 		return;
 
 	/* Handle the whole subtree before looking at 'vd' itself. */
 	for (c = 0; c < vd->vdev_children; c++)
 		spa_async_autoexpand(spa, vd->vdev_child[c]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		(void) snprintf(physpath, MAXPATHLEN, "/devices%s",
 		    vd->vdev_physpath);
 
 		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 
 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 		    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
 
 		nvlist_free(attr);
 		kmem_free(physpath, MAXPATHLEN);
 	}
 }
 
 /*
  * Body of the general async thread for a pool ('arg' is the spa_t).
  *
  * Takes a snapshot of the pending task bits, services the config-update,
  * autoexpand, probe, resilver-done and resilver tasks found in it, then
  * clears spa_async_thread, wakes any waiters on spa_async_cv and exits.
  * SPA_ASYNC_REMOVE is not handled here; that bit is left pending (it is
  * serviced by spa_async_thread_vd()).
  */
 static void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/*
 	 * Snapshot the requested tasks and clear every pending bit except
 	 * SPA_ASYNC_REMOVE, all under the async lock.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		/* Measure the normal class before and after the update. */
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
 	/*
 	 * Post autoexpand events, unless the pool is suspended.
 	 */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE)
 		spa_vdev_resilver_done(spa);
 
 	/*
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
 		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Body of the async thread that services SPA_ASYNC_REMOVE ('arg' is the
  * spa_t).
  *
  * Clears the pending REMOVE bit, marks the requested vdevs (pool vdevs,
  * l2cache devices and spares) as removed, and repeats for as long as a new
  * REMOVE request has arrived in the meantime.  Finally clears
  * spa_async_thread_vd, wakes any waiters on spa_async_cv and exits.
  */
 static void
 spa_async_thread_vd(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	/* Entered with spa_async_lock held, both initially and on retry. */
 retry:
 	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	/*
 	 * A REMOVE request that was posted while we were working is handled
 	 * by going around again rather than exiting.
 	 */
 	tasks = spa->spa_async_tasks;
 	if ((tasks & SPA_ASYNC_REMOVE) != 0)
 		goto retry;
 	spa->spa_async_thread_vd = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Suspend async task processing for this pool.
  *
  * Bumps spa_async_suspended, which the dispatch routines check before
  * starting a thread, and then blocks until neither the general async
  * thread nor the vdev-removal async thread is running.  Each thread clears
  * its own pointer and broadcasts spa_async_cv as it exits, so we have to
  * keep waiting for as long as EITHER pointer is still set; waiting only
  * while both are set would return with one thread still active.
  *
  * Every call must be balanced by a call to spa_async_resume().
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	while (spa->spa_async_thread != NULL ||
 	    spa->spa_async_thread_vd != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Undo one spa_async_suspend() by dropping the suspend count by one.  The
  * count must be non-zero on entry (asserted).
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended -= 1;
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Report whether the general async thread has work to do.
  *
  * SPA_ASYNC_REMOVE is ignored here.  Any other task except a config
  * update always counts.  A config update counts unless spa_ccw_fail_time
  * is set and less than zfs_ccw_retry_interval seconds have passed since
  * then.
  */
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t pending = spa->spa_async_tasks;
 	uint_t other_tasks;
 	boolean_t config_held_off = B_FALSE;
 
 	other_tasks = pending &
 	    ~(SPA_ASYNC_CONFIG_UPDATE | SPA_ASYNC_REMOVE);
 
 	/* A recorded failure time holds config updates off for a while. */
 	if (spa->spa_ccw_fail_time != 0) {
 		config_held_off =
 		    (gethrtime() - spa->spa_ccw_fail_time) <
 		    (zfs_ccw_retry_interval * NANOSEC);
 	}
 
 	return (other_tasks != 0 ||
 	    ((pending & SPA_ASYNC_CONFIG_UPDATE) != 0 && !config_held_off));
 }
 
 /*
  * Start the general async thread when it has work pending, async
  * processing is not suspended, no such thread is already running, and
  * 'rootdir' is set.
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	if (spa->spa_async_thread == NULL && rootdir != NULL &&
 	    !spa->spa_async_suspended && spa_async_tasks_pending(spa)) {
 		spa->spa_async_thread = thread_create(NULL, 0,
 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 	}
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Start the vdev-removal async thread when SPA_ASYNC_REMOVE is pending,
  * async processing is not suspended, no such thread is already running,
  * and 'rootdir' is set.
  */
 static void
 spa_async_dispatch_vd(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	if (spa->spa_async_thread_vd == NULL && rootdir != NULL &&
 	    !spa->spa_async_suspended &&
 	    (spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0) {
 		spa->spa_async_thread_vd = thread_create(NULL, 0,
 		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
 	}
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record an async task request for the pool.  The request is logged with
  * zfs_dbgmsg(), the task bit(s) are added to spa_async_tasks under the
  * async lock, and spa_async_dispatch_vd() is then given the chance to
  * start the vdev-removal thread.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks = spa->spa_async_tasks | task;
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 /*
  * Callback that appends 'bp' to the bpobj passed as 'arg'.  Always
  * returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	bpobj_enqueue(arg, bp, tx);
 
 	return (0);
 }
 
 /*
  * Callback that issues an asynchronous free of 'bp' as a child of the zio
  * passed as 'arg', in the txg of 'tx'.  The pool and the I/O flags are
  * taken from that zio.  Always returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *zio = arg;
 	zio_t *free_zio;
 
 	free_zio = zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
 	    BP_GET_PSIZE(bp), zio->io_flags);
 	zio_nowait(free_zio);
 
 	return (0);
 }
 
 /*
  * Free every block pointer on 'bpl' under a single root zio and wait for
  * all of the frees to complete; the wait must succeed.
  *
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing frees.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	zio = zio_root(spa, NULL, NULL, 0);
 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 
 	VERIFY(zio_wait(zio) == 0);
 }
 
 /*
  * Free every block pointer in the pool's deferred bpobj under a single
  * root zio and wait for all of the frees to complete; both the iteration
  * and the wait must succeed.
  *
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing deferred frees.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 	    spa_free_sync_cb, zio, tx), ==, 0);
 	VERIFY0(zio_wait(zio));
 }
 
 
 /*
  * Write nvlist 'nv', XDR-encoded, to MOS object 'obj' in transaction 'tx'.
  *
  * The packed data is zero-padded up to a multiple of SPA_CONFIG_BLOCKSIZE
  * and written at offset 0; the unpadded packed size is then stored in the
  * object's bonus buffer.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	char *packed;
 	uint64_t *sizep;
 	size_t nvsize = 0;
 	size_t padded;
 
 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
 	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
 	padded = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = kmem_alloc(padded, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
 	/* Zero the tail so that no uninitialized bytes are written out. */
 	bzero(packed + nvsize, padded - nvsize);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, padded, packed, tx);
 	kmem_free(packed, padded);
 
 	/* Record the real (unpadded) size in the bonus buffer. */
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	sizep = db->db_data;
 	*sizep = nvsize;
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Sync the list of auxiliary devices described by 'sav' to the MOS, if
  * 'sav' is flagged as needing it (sav_sync).
  *
  * The backing packed-nvlist object is created on first use and linked
  * into the pool directory under 'entry'.  An nvlist holding the generated
  * config of every device, stored as an array named 'config', is then
  * written to that object and sav_sync is cleared.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 	nvlist_t **list;
 	int n;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Update the MOS nvlist describing the list of available devices.
 	 * spa_validate_aux() will have already made sure this nvlist is
 	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	if (sav->sav_count != 0) {
 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 
 		n = 0;
 		while (n < sav->sav_count) {
 			list[n] = vdev_config_generate(spa, sav->sav_vdevs[n],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 			n++;
 		}
 
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
 		    sav->sav_count) == 0);
 
 		/* The generated configs are no longer needed once added. */
 		n = 0;
 		while (n < sav->sav_count) {
 			nvlist_free(list[n]);
 			n++;
 		}
 		kmem_free(list, sav->sav_count * sizeof (void *));
 	} else {
 		/* No devices: store an empty array. */
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * Regenerate the pool config and write it to the MOS config object, but
  * only when some vdev is on the config dirty list.  The new config
  * replaces spa_config_syncing (a previous one is freed first).
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
 	if (!list_is_empty(&spa->spa_config_dirty_list)) {
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		config = spa_config_generate(spa, spa->spa_root_vdev,
 		    dmu_tx_get_txg(tx), B_FALSE);
 
 		/*
 		 * If we're upgrading the spa version then make sure that
 		 * the config object gets updated with the correct version.
 		 */
 		if (spa->spa_ubsync.ub_version <
 		    spa->spa_uberblock.ub_version) {
 			fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 			    spa->spa_uberblock.ub_version);
 		}
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		/* Swap in the new config, dropping any previous one. */
 		if (spa->spa_config_syncing)
 			nvlist_free(spa->spa_config_syncing);
 		spa->spa_config_syncing = config;
 
 		spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 	}
 }
 
 /*
  * Set the pool's on-disk version to the uint64_t that 'arg' points to.
  *
  * The new version is stored in the in-core uberblock, the root vdev's
  * config is dirtied, and the change is recorded in the pool history.  The
  * version must be supported and must not be lower than the current one
  * (both asserted).
  */
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t version = *(uint64_t *)arg;
 
 	/*
 	 * Setting the version is special cased when first creating the pool.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
 }
 
 /*
  * Set zpool properties.
  *
  * 'arg' is an nvlist of properties to apply; 'tx' is the transaction the
  * changes are made in.  Each pair is handled according to its property:
  * feature names ("...@feature") enable the feature, 'comment' is kept in
  * the spa_t, a few properties are not persisted at all, and everything
  * else is written to the pool-props ZAP object (created on first use) and,
  * for selected properties, mirrored into the in-core spa_t.  All of this
  * is done with spa_props_lock held.
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
 		case ZPROP_INVAL:
 			/*
 			 * We checked this earlier in spa_prop_validate().
 			 */
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 
 			/* The feature name is the part after the '@'. */
 			fname = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(fname, &fid));
 
 			spa_feature_enable(spa, fid, tx);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=enabled", nvpair_name(elem));
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			/* Replace the in-core copy of the comment. */
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  It's unnecessary
 			 * to do this for pool creation since the vdev's
 			 * configuration has already been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL)
 				vdev_config_dirty(spa->spa_root_vdev);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			propname = zpool_prop_to_name(prop);
 			proptype = zpool_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				/* Stored as bytes, including the terminator. */
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", nvpair_name(elem), strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				/* An index value must map to a known string. */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				/* Stored as a single 8-byte integer. */
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", nvpair_name(elem), intval);
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 
 			/*
 			 * Mirror selected properties into the in-core spa_t.
 			 * NOTE(review): 'intval' is only assigned in the
 			 * uint64 branch above; the cases below presumably
 			 * are always uint64 properties -- confirm.
 			 */
 			switch (prop) {
 			case ZPOOL_PROP_DELEGATION:
 				spa->spa_delegation = intval;
 				break;
 			case ZPOOL_PROP_BOOTFS:
 				spa->spa_bootfs = intval;
 				break;
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
 				/* Not during the pool's initial txg. */
 				if (tx->tx_txg != TXG_INITIAL)
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOEXPAND);
 				break;
 			case ZPOOL_PROP_DEDUPDITTO:
 				spa->spa_dedup_ditto = intval;
 				break;
 			default:
 				break;
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Perform one-time upgrade on-disk changes.  spa_version() does not
  * reflect the new version this txg, so there must be no changes this
  * txg to anything that the upgrade code depends on after it executes.
  * Therefore this must be called after dsl_pool_sync() does the sync
  * tasks.
  *
  * Each step runs when the last synced uberblock version (spa_ubsync) is
  * below a given threshold and the in-core uberblock version
  * (spa_uberblock) is at or above it.  Must be called in sync pass 1
  * (asserted).
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	ASSERT(spa->spa_sync_pass == 1);
 
 	/* The pool config lock is held as writer across all upgrade steps. */
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* Keeping the origin open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
 	 * when possibility to use lz4 compression for metadata was added
 	 * Old pools that have this feature enabled must be upgraded to have
 	 * this feature active
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		boolean_t lz4_en = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t lz4_ac = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		/* Enabled but not yet active: bump the feature refcount. */
 		if (lz4_en && !lz4_ac)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  * Outline, in the order performed below:
  *   1. Take SCL_CONFIG as reader (held until just before the async
  *      dispatch at the end) and fold pending vdev state changes into
  *      config changes.
  *   2. Create the tx for 'txg' and arm the deadman timer.
  *   3. Run sync passes until the MOS is no longer dirty for 'txg'.
  *   4. Write the vdev configuration via vdev_config_sync(), retrying
  *      until it succeeds.
  *   5. Commit the tx, disarm the deadman timer, publish the new config
  *      and run the per-txg "done" processing.
  *
  * The pool must be writeable (VERIFY'd).
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while (list_head(&spa->spa_state_dirty_list) != NULL) {
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		/*
 		 * Drop back to reader.  The locks are released in between,
 		 * so the outer loop re-checks the dirty list.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Arm the deadman timer for this sync; it is disarmed again right
 	 * after dmu_tx_commit() below.
 	 */
 	spa->spa_sync_starttime = gethrtime();
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
 	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
 #else	/* FreeBSD */
 #ifdef _KERNEL
 	callout_reset(&spa->spa_deadman_cycid,
 	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
 #endif
 #endif
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		int i;
 
 		/*
 		 * Stop at the first top-level vdev whose deflate ratio is
 		 * not SPA_MINBLOCKSIZE; i == vdev_children afterwards means
 		 * no child broke out of the loop.
 		 */
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY(0 == zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	/*
 	 * Iterate to convergence.
 	 */
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		/*
 		 * Early passes process this txg's frees directly; from pass
 		 * zfs_sync_pass_deferred_free onward they are queued on
 		 * spa_deferred_bpobj instead.
 		 */
 		if (pass < zfs_sync_pass_deferred_free) {
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			/*
 			 * We can not defer frees in pass 1, because
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 
 		/* assignment intended: drain the vdev list for this txg */
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
 		if (pass == 1) {
 			spa_sync_upgrades(spa, tx);
 			ASSERT3U(txg, >=,
 			    spa->spa_uberblock.ub_rootbp.blk_birth);
 			/*
 			 * Note: We need to check if the MOS is dirty
 			 * because we could have marked the MOS dirty
 			 * without updating the uberblock (e.g. if we
 			 * have sync tasks but no dirty user data).  We
 			 * need to check the uberblock's rootbp because
 			 * it is updated if we have synced out dirty
 			 * data (though in this case the MOS will most
 			 * likely also be dirty due to second order
 			 * effects, we don't want to rely on that here).
 			 */
 			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
 			    !dmu_objset_is_dirty(mos, txg)) {
 				/*
 				 * Nothing changed on the first pass,
 				 * therefore this TXG is a no-op.  Avoid
 				 * syncing deferred frees, so that we
 				 * can keep this TXG as a no-op.
 				 */
 				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
 				    txg));
 				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
 				break;
 			}
 			spa_sync_deferred_frees(spa, tx);
 		}
 
 	} while (dmu_objset_is_dirty(mos, txg));
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
 	 *
 	 * If there are no dirty vdevs, we sync the uberblock to a few
 	 * random top-level vdevs that are known to be visible in the
 	 * config cache (see spa_vdev_add() for a complete description).
 	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
 	 */
 	for (;;) {
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			vdev_t *svd[SPA_DVAS_PER_BP];
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = spa_get_random(children);
 
 			/*
 			 * Starting at a random child, collect up to
 			 * SPA_DVAS_PER_BP top-level vdevs, skipping log
 			 * devices and vdevs with vdev_ms_array == 0.
 			 */
 			for (int c = 0; c < children; c++) {
 				vd = rvd->vdev_child[(c0 + c) % children];
 				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
 					continue;
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
 			/*
 			 * On failure, retry once with the last argument set.
 			 * NOTE(review): the meaning of that flag is defined
 			 * by vdev_config_sync(), which is not visible here.
 			 */
 			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(svd, svdcount, txg,
 				    B_TRUE);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(rvd->vdev_child,
 				    rvd->vdev_children, txg, B_TRUE);
 		}
 
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/*
 		 * Both attempts failed: suspend the pool, wait for it to be
 		 * resumed, then go around and try the config sync again.
 		 */
 		zio_suspend(spa, NULL);
 		zio_resume_wait(spa);
 	}
 	dmu_tx_commit(tx);
 
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
 #else	/* FreeBSD */
 #ifdef _KERNEL
 	callout_drain(&spa->spa_deadman_cycid);
 #endif
 #endif
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	/*
 	 * Remember the uberblock just written; the version-upgrade checks
 	 * in later txgs compare spa_ubsync against spa_uberblock.
 	 */
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	spa->spa_sync_pass = 0;
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * Wait for a sync on every pool that is active, writeable and not
  * suspended.  The namespace lock is only held while walking the pool
  * list; for the wait itself we pin the pool with a reference and drop
  * the lock, retaking it before releasing the reference and moving on.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *pool;
 
 	mutex_enter(&spa_namespace_lock);
 	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
 		if (spa_state(pool) == POOL_STATE_ACTIVE &&
 		    spa_writeable(pool) && !spa_suspended(pool)) {
 			/* hold the pool so it survives the unlocked wait */
 			spa_open_ref(pool, FTAG);
 			mutex_exit(&spa_namespace_lock);
 
 			txg_wait_synced(spa_get_dsl(pool), 0);
 
 			mutex_enter(&spa_namespace_lock);
 			spa_close(pool, FTAG);
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove all pools in the system.
  *
  * The loop always asks spa_next() for the first pool and ends each
  * iteration by calling spa_remove() on it.  spa_namespace_lock is held
  * throughout except around spa_async_suspend(), where a temporary
  * reference keeps the spa_t pinned.
  */
 void
 spa_evict_all(void)
 {
 	spa_t *spa;
 
 	/*
 	 * Remove all cached state.  All pools should be closed now,
 	 * so every spa in the AVL tree should be unreferenced.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(NULL)) != NULL) {
 		/*
 		 * Stop async tasks.  The async thread may need to detach
 		 * a device that's been replaced, which requires grabbing
 		 * spa_namespace_lock, so we must drop it here.
 		 */
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(spa);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 
 		/* only pools that got past the uninitialized state need teardown */
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(spa);
 			spa_deactivate(spa);
 		}
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Find the vdev with the given guid.  The root vdev tree is searched
  * first; when 'aux' is set and nothing was found there, the l2cache
  * devices and then the spares are checked as well.  Returns NULL if no
  * vdev matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *found;
 	int idx;
 
 	found = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 	if (found != NULL || !aux)
 		return (found);
 
 	/* not in the main tree: try the cache devices ... */
 	for (idx = 0; idx < spa->spa_l2cache.sav_count; idx++) {
 		found = spa->spa_l2cache.sav_vdevs[idx];
 		if (found->vdev_guid == guid)
 			return (found);
 	}
 
 	/* ... and finally the spares */
 	for (idx = 0; idx < spa->spa_spares.sav_count; idx++) {
 		found = spa->spa_spares.sav_vdevs[idx];
 		if (found->vdev_guid == guid)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Set the pool's in-core uberblock version to 'version', dirty the
  * root vdev's config, and wait for a txg to sync.  The version change
  * is made with SCL_ALL held as writer.  'version' must not be lower
  * than the current version (asserted), and the pool must be writeable
  * (asserted).
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* wait outside the config lock for the change to be synced */
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 /*
  * Report whether 'guid' names one of this pool's spares, looking both
  * at the spare vdevs already in place and at the pending spare configs.
  */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t pending_guid;
 	int n;
 
 	/* spare vdevs that already exist */
 	for (n = 0; n < spares->sav_count; n++) {
 		if (spares->sav_vdevs[n]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	/* pending entries; ones without a readable guid are skipped */
 	for (n = 0; n < spares->sav_npending; n++) {
 		if (nvlist_lookup_uint64(spares->sav_pending[n],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Return B_TRUE if any of this pool's spares is an active spare that
  * is also shared.  An active spare carries two references (one as a
  * spare, one as a replacement), so a spare recorded against this pool
  * with more than two references counts as shared.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t owner;
 	int cnt, n;
 
 	for (n = 0; n < spares->sav_count; n++) {
 		if (!spa_spare_exists(spares->sav_vdevs[n]->vdev_guid,
 		    &owner, &cnt))
 			continue;
 
 		/* must be recorded against a pool, and that pool must be us */
 		if (owner == 0ULL || owner != spa_guid(spa))
 			continue;
 
 		if (cnt > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Post a sysevent corresponding to the given event.  The 'name' must be one of
  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  *
  * Attributes added: pool name and pool guid always; vdev guid when 'vd' is
  * non-NULL; vdev path when the vdev has one.  If adding or attaching an
  * attribute fails, the event is dropped without being logged.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
 {
 #ifdef _KERNEL
 	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
 	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
 
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
 		goto done;
 
 	value.value_type = SE_DATA_TYPE_UINT64;
 	value.value.sv_uint64 = spa_guid(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
 		goto done;
 
 	if (vd) {
 		value.value_type = SE_DATA_TYPE_UINT64;
 		value.value.sv_uint64 = vd->vdev_guid;
 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
 		    SE_SLEEP) != 0)
 			goto done;
 
 		if (vd->vdev_path) {
 			value.value_type = SE_DATA_TYPE_STRING;
 			value.value.sv_string = vd->vdev_path;
 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
 			    &value, SE_SLEEP) != 0)
 				goto done;
 		}
 	}
 
 	if (sysevent_attach_attributes(ev, attr) != 0)
 		goto done;
 	/*
 	 * Once attached, clear our pointer so the cleanup below does not
 	 * free the list separately (presumably the event now owns it --
 	 * confirm against sysevent_attach_attributes()).
 	 */
 	attr = NULL;
 
 	(void) log_sysevent(ev, SE_SLEEP, &eid);
 
 	/* common exit: free any unattached attribute list, then the event */
 done:
 	if (attr)
 		sysevent_free_attr(attr);
 	sysevent_free(ev);
 #endif
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	(revision 275782)
@@ -1,545 +1,545 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/utsname.h>
 #include <sys/sunddi.h>
 #include <sys/cred.h>
 #include "zfs_comutil.h"
 #ifdef _KERNEL
 #include <sys/cmn_err.h>
 #include <sys/zone.h>
 #endif
 
 /*
  * Routines to manage the on-disk history log.
  *
  * The history log is stored as a dmu object containing
  * <packed record length, record nvlist> tuples.
  *
  * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
  * "packed record length" is the packed length of the "record nvlist" stored
  * as a little endian uint64_t.
  *
  * The log is implemented as a ring buffer, though the original creation
  * of the pool ('zpool create') is never overwritten.
  *
  * The history log is tracked as object 'spa_t::spa_history'.  The bonus buffer
  * of 'spa_history' stores the offsets for logging/retrieving history as
  * 'spa_history_phys_t'.  'sh_pool_create_len' is the ending offset in bytes of
  * where the 'zpool create' record is stored.  This allows us to never
  * overwrite the original creation of the pool.  'sh_phys_max_off' is the
  * physical ending offset in bytes of the log.  This tells you the length of
  * the buffer. 'sh_eof' is the logical EOF (in bytes).  Whenever a record
  * is added, 'sh_eof' is incremented by the size of the record.
  * 'sh_eof' is never decremented.  'sh_bof' is the logical BOF (in bytes).
  * This is where the consumer should start reading from after reading in
  * the 'zpool create' portion of the log.
  *
  * 'sh_records_lost' keeps track of how many records have been overwritten
  * and permanently lost.
  */
 
 /*
  * Map a logical history offset onto its physical offset in the history
  * object.  The region past the 'zpool create' record is treated as a
  * ring, so the logical offset is reduced modulo the ring length and
  * rebased at sh_pool_create_len.
  */
 static uint64_t
 spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
 {
 	uint64_t create_len = shpp->sh_pool_create_len;
 	uint64_t ring_len = shpp->sh_phys_max_off - create_len;
 	uint64_t ring_off = (log_off - create_len) % ring_len;
 
 	return (ring_off + create_len);
 }
 
 /*
  * Allocate the pool's history object, record it in the pool directory
  * under DMU_POOL_HISTORY, and initialize the maximum physical size of
  * the log in the object's bonus buffer.  Must only be called when the
  * pool has no history object yet (asserted).
  */
 void
 spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	spa_history_phys_t *hist;
 	dmu_buf_t *dbp;
 	uint64_t max_off;
 
 	ASSERT(spa->spa_history == 0);
 
 	/* the bonus buffer of the new object holds the log offsets */
 	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
 	    sizeof (spa_history_phys_t), tx);
 
 	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
 	    &spa->spa_history, tx) == 0);
 
 	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
 	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
 
 	hist = dbp->db_data;
 	dmu_buf_will_dirty(dbp, tx);
 
 	/*
 	 * Size the log at 0.1% of the pool, clamped to the range
 	 * [128KB, 1G].
 	 */
 	max_off = metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
 	max_off = MIN(max_off, 1<<30);
 	max_off = MAX(max_off, 128<<10);
 	hist->sh_phys_max_off = max_off;
 
 	dmu_buf_rele(dbp, FTAG);
 }
 
 /*
  * Change 'sh_bof' to the beginning of the next record.
  *
  * Reads the length prefix of the record currently at 'sh_bof', then
  * advances 'sh_bof' past that prefix and the record body and counts the
  * record as lost.  Returns 0 on success or the dmu_read() error, in
  * which case 'shpp' is left unmodified.
  */
 static int
 spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t firstread, reclen, phys_bof;
 	char buf[sizeof (reclen)];
 	int err;
 
 	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
 	/* the length prefix may be split across the physical end of the log */
 	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
 
 	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
 	    buf, DMU_READ_PREFETCH)) != 0)
 		return (err);
 	if (firstread != sizeof (reclen)) {
 		/* remainder wraps to just after the 'zpool create' record */
 		if ((err = dmu_read(mos, spa->spa_history,
 		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
 		    buf + firstread, DMU_READ_PREFETCH)) != 0)
 			return (err);
 	}
 
 	/* the prefix is stored little-endian */
 	reclen = LE_64(*((uint64_t *)buf));
 	shpp->sh_bof += reclen + sizeof (reclen);
 	shpp->sh_records_lost++;
 	return (0);
 }
 
 /*
  * Append 'len' bytes from 'buf' at the logical end of the history log,
  * advancing 'sh_eof'.  Older records are dropped first (by advancing
  * 'sh_bof') until more than 'len' bytes are free in the ring.  A write
  * that reaches the physical end of the log continues just after the
  * 'zpool create' record.
  *
  * The caller must hold spa_history_lock (asserted).  Returns 0, or the
  * error from spa_history_advance_bof(), in which case nothing has been
  * written.
  */
 static int
 spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
     dmu_tx_t *tx)
 {
 	uint64_t firstwrite, phys_eof;
 	objset_t *mos = spa->spa_meta_objset;
 	int err;
 
 	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
 
 	/* see if we need to reset logical BOF */
 	/* (ring size minus bytes in use) is the free space left in the ring */
 	while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
 	    (shpp->sh_eof - shpp->sh_bof) <= len) {
 		if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
 			return (err);
 		}
 	}
 
 	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
 	/* bytes that fit before the physical end of the log */
 	firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
 	shpp->sh_eof += len;
 	dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
 
 	len -= firstwrite;
 	if (len > 0) {
 		/* write out the rest at the beginning of physical file */
 		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
 		    len, (char *)buf + firstwrite, tx);
 	}
 
 	return (0);
 }
 
 /*
  * Return the name recorded as the "zone" of a history entry: in the
  * kernel, the hostname of the calling thread's jail when the thread is
  * jailed; otherwise (and always in userland) NULL.
  */
 static char *
 spa_history_zone(void)
 {
 	char *zone = NULL;
 
 #ifdef _KERNEL
 	/* XXX: pr_hostname can be changed by default from within a jail! */
 	if (jailed(curthread->td_ucred))
 		zone = curthread->td_ucred->cr_prison->pr_hostname;
 #endif
 	return (zone);
 }
 
 /*
  * Write out a history event.
  *
  * 'arg' is the nvlist describing the event; it is stamped with the
  * current time (and, in the kernel, the host name), packed, and
  * appended to the on-disk log as a <little-endian length, packed
  * nvlist> pair.  The nvlist is freed before returning.  The history
  * object is created first if the pool does not have one yet.
  *
  * Errors from spa_history_write() are not reported to the caller.
  */
 /*ARGSUSED*/
 static void
 spa_history_log_sync(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t	*nvl = arg;
 	spa_t		*spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t	*mos = spa->spa_meta_objset;
 	dmu_buf_t	*dbp;
 	spa_history_phys_t *shpp;
 	size_t		reclen;
 	uint64_t	le_len;
 	char		*record_packed = NULL;
 	int		ret;
 
 	/*
 	 * If we have an older pool that doesn't have a command
 	 * history object, create it now.
 	 */
 	mutex_enter(&spa->spa_history_lock);
 	if (!spa->spa_history)
 		spa_history_create_obj(spa, tx);
 	mutex_exit(&spa->spa_history_lock);
 
 	/*
 	 * Get the offset of where we need to write via the bonus buffer.
 	 * Update the offset when the write completes.
 	 */
 	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
 	shpp = dbp->db_data;
 
 	dmu_buf_will_dirty(dbp, tx);
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbp, &doi);
 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
 	}
 #endif
 
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
 #ifdef _KERNEL
 	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
 #endif
 	/* mirror the event into the debug log, by record flavor */
 	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
 		zfs_dbgmsg("command: %s",
 		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
 	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
 		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
 			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
 		} else {
 			zfs_dbgmsg("txg %lld %s %s",
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
 		}
 	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
 		zfs_dbgmsg("ioctl %s",
 		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
 	}
 
 	record_packed = fnvlist_pack(nvl, &reclen);
 
 	mutex_enter(&spa->spa_history_lock);
 
 	/* write out the packed length as little endian */
 	le_len = LE_64((uint64_t)reclen);
 	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
 	if (!ret)
 		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
 
 	/* The first command is the create, which we keep forever */
 	/* pinning create_len and bof at eof keeps it out of the ring */
 	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
 	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
 		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
 	}
 
 	mutex_exit(&spa->spa_history_lock);
 	fnvlist_pack_free(record_packed, reclen);
 	dmu_buf_rele(dbp, FTAG);
 	fnvlist_free(nvl);
 }
 
 /*
  * Log a command string to the pool history.  The message is wrapped in
  * a temporary nvlist under ZPOOL_HIST_CMD and handed to
  * spa_history_log_nvl(), whose result is returned.
  */
 int
 spa_history_log(spa_t *spa, const char *msg)
 {
 	nvlist_t *record;
 	int rc;
 
 	record = fnvlist_alloc();
 	fnvlist_add_string(record, ZPOOL_HIST_CMD, msg);
 
 	rc = spa_history_log_nvl(spa, record);
 
 	/* the temporary nvlist is ours to free */
 	fnvlist_free(record);
 	return (rc);
 }
 
 /*
  * Queue a history record built from 'nvl' to be written in syncing
  * context.
  *
  * 'nvl' is duplicated, so the caller keeps ownership of it.  The copy
  * is tagged with the caller's zone name (when there is one) and real
  * uid, and is freed by spa_history_log_sync() once written.
  *
  * Returns EINVAL if the pool version predates pool history or the pool
  * is not writeable, the dmu_tx_assign() error if a transaction cannot
  * be assigned, and 0 otherwise.  Errors that occur later, while the
  * record is actually written, are not reported.
  */
 int
 spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
 {
 	dmu_tx_t *tx;
 	nvlist_t *nvarg;
 	char *zone;
 	int err;
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY ||
 	    !spa_writeable(spa))
 		return (SET_ERROR(EINVAL));
 
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	nvarg = fnvlist_dup(nvl);
 	zone = spa_history_zone();
 	if (zone != NULL)
 		fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, zone);
 	fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
 
 	/* Kick this off asynchronously; errors are ignored. */
 	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
 	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
 	dmu_tx_commit(tx);
 
 	/* spa_history_log_sync() will free nvarg; nvl stays with the caller */
 	return (0);
 }
 
 /*
  * Read out the command history.
  *
  *	offp	in: logical offset to start reading at; out: offset the
  *		consumer should pass on the next call
  *	len	in: capacity of 'buf'; out: number of bytes stored in 'buf'
  *	buf	destination for the raw log bytes
  *
  * Returns ENOENT if the pool has no history object, the error from
  * dmu_bonus_hold() or dmu_read() on failure, and 0 otherwise.  A return
  * of 0 with *len == 0 means there was nothing to read at *offp.
  */
 int
 spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *dbp;
 	uint64_t read_len, phys_read_off, phys_eof;
 	uint64_t leftover = 0;
 	spa_history_phys_t *shpp;
 	int err;
 
 	/*
 	 * If the command history doesn't exist (older pool),
 	 * that's ok, just return ENOENT.
 	 */
 	if (!spa->spa_history)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * The history is logged asynchronously, so when they request
 	 * the first chunk of history, make sure everything has been
 	 * synced to disk so that we get it.
 	 */
 	if (*offp == 0 && spa_writeable(spa))
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
 		return (err);
 	shpp = dbp->db_data;
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbp, &doi);
 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
 	}
 #endif
 
 	mutex_enter(&spa->spa_history_lock);
 	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
 
 	if (*offp < shpp->sh_pool_create_len) {
 		/* read in just the zpool create history */
 		/* in this region logical and physical offsets coincide */
 		phys_read_off = *offp;
 		read_len = MIN(*len, shpp->sh_pool_create_len -
 		    phys_read_off);
 	} else {
 		/*
 		 * Need to reset passed in offset to BOF if the passed in
 		 * offset has since been overwritten.
 		 */
 		*offp = MAX(*offp, shpp->sh_bof);
 		phys_read_off = spa_history_log_to_phys(*offp, shpp);
 
 		/*
 		 * Read up to the minimum of what the user passed down or
 		 * the EOF (physical or logical).  If we hit physical EOF,
 		 * use 'leftover' to read from the physical BOF.
 		 */
 		if (phys_read_off <= phys_eof) {
 			read_len = MIN(*len, phys_eof - phys_read_off);
 		} else {
 			read_len = MIN(*len,
 			    shpp->sh_phys_max_off - phys_read_off);
 			if (phys_read_off + *len > shpp->sh_phys_max_off) {
 				leftover = MIN(*len - read_len,
 				    phys_eof - shpp->sh_pool_create_len);
 			}
 		}
 	}
 
 	/* offset for consumer to use next */
 	*offp += read_len + leftover;
 
 	/* tell the consumer how much you actually read */
 	*len = read_len + leftover;
 
 	/* nothing to read: release everything and report success */
 	if (read_len == 0) {
 		mutex_exit(&spa->spa_history_lock);
 		dmu_buf_rele(dbp, FTAG);
 		return (0);
 	}
 
 	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
 	    DMU_READ_PREFETCH);
 	/* wrapped part, read from just after the 'zpool create' record */
 	if (leftover && err == 0) {
 		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
 		    leftover, buf + read_len, DMU_READ_PREFETCH);
 	}
 	mutex_exit(&spa->spa_history_lock);
 
 	dmu_buf_rele(dbp, FTAG);
 	return (err);
 }
 
 /*
  * The nvlist will be consumed by this call.
  *
  * Common back end for the spa_history_log_internal*() entry points:
  * formats 'fmt'/'adx' into a message, adds it to 'nvl' together with
  * the operation name and the tx's txg, and then either writes the
  * record immediately (when 'tx' is already syncing) or queues it as a
  * sync task on 'tx'.
  */
 static void
 log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
     dmu_tx_t *tx, const char *fmt, va_list adx)
 {
 	char *msg;
 	va_list adx2;
 
 	/*
 	 * If this is part of creating a pool, not everything is
 	 * initialized yet, so don't bother logging the internal events.
 	 * Likewise if the pool is not writeable.
 	 */
 	if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
 		fnvlist_free(nvl);
 		return;
 	}
 
 	/*
 	 * The argument list is walked twice: once by vsnprintf() to size
 	 * the buffer and once by vsprintf() to fill it, so the second walk
 	 * needs its own copy.
 	 */
 	va_copy(adx2, adx);
 
 	msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
 	(void) vsprintf(msg, fmt, adx2);
 	fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
 	/*
 	 * NOTE(review): strfree() presumably frees strlen(msg) + 1 bytes,
 	 * matching the allocation above -- confirm.
 	 */
 	strfree(msg);
 
 	va_end(adx2);
 
 	fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
 
 	if (dmu_tx_is_syncing(tx)) {
 		spa_history_log_sync(nvl, tx);
 	} else {
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
 	}
 	/* spa_history_log_sync() will free nvl */
 }
 
 /*
  * Log an internal (non-command) history event for the pool.  'fmt' and
  * the variable arguments form the printf-style message text.
  *
  * 'tx' may be NULL, in which case a transaction is created, assigned
  * with TXG_WAIT and committed here; if the assignment fails the event
  * is silently dropped.
  */
 void
 spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	dmu_tx_t *htx = tx;
 	va_list adx;
 
 	/* create a tx if we didn't get one */
 	if (tx == NULL) {
 		htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 		if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
 			dmu_tx_abort(htx);
 			return;
 		}
 	}
 
 	va_start(adx, fmt);
 	/* the freshly allocated nvlist is consumed by log_internal() */
 	log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
 	va_end(adx);
 
 	/* if we didn't get a tx from the caller, commit the one we made */
 	if (tx == NULL)
 		dmu_tx_commit(htx);
 }
 
 /*
  * Log an internal history event about a dataset.  The record carries
  * the dataset's name and object number in addition to the formatted
  * message.  'tx' is required (asserted).
  */
 void
 spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	nvlist_t *record = fnvlist_alloc();
 	char dsname[MAXNAMELEN];
 	va_list args;
 
 	ASSERT(tx != NULL);
 
 	/* identify the dataset by name and by object id */
 	dsl_dataset_name(ds, dsname);
 	fnvlist_add_string(record, ZPOOL_HIST_DSNAME, dsname);
 	fnvlist_add_uint64(record, ZPOOL_HIST_DSID, ds->ds_object);
 
 	/* log_internal() consumes the record */
 	va_start(args, fmt);
 	log_internal(record, operation, dsl_dataset_get_spa(ds), tx, fmt,
 	    args);
 	va_end(args);
 }
 
 /*
  * Log an internal history event about a dsl_dir.  The record carries
  * the directory's name and the object number of its head dataset in
  * addition to the formatted message.  'tx' is required (asserted); the
  * nvlist built here is consumed by log_internal().
  */
 void
 spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	va_list adx;
 	char namebuf[MAXNAMELEN];
 	nvlist_t *nvl = fnvlist_alloc();
 
 	ASSERT(tx != NULL);
 
 	dsl_dir_name(dd, namebuf);
 	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
-	    dd->dd_phys->dd_head_dataset_obj);
+	    dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	va_start(adx, fmt);
 	log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * Log an internal history event recording the pool's version, the
  * software's SPA and ZPL versions, and the utsname fields (nodename,
  * release, version, machine).  No tx is passed, so
  * spa_history_log_internal() creates its own.
  */
 void
 spa_history_log_version(spa_t *spa, const char *operation)
 {
 	spa_history_log_internal(spa, operation, NULL,
 	    "pool version %llu; software version %llu/%d; uts %s %s %s %s",
 	    (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
 	    utsname.nodename, utsname.release, utsname.version,
 	    utsname.machine);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	(revision 275782)
@@ -1,374 +1,373 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
 #define	_SYS_DBUF_H
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/zrlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #define	IN_DMU_SYNC 2
 
 /*
  * define flags for dbuf_read
  */
 
 #define	DB_RF_MUST_SUCCEED	(1 << 0)
 #define	DB_RF_CANFAIL		(1 << 1)
 #define	DB_RF_HAVESTRUCT	(1 << 2)
 #define	DB_RF_NOPREFETCH	(1 << 3)
 #define	DB_RF_NEVERWAIT		(1 << 4)
 #define	DB_RF_CACHED		(1 << 5)
 
 /*
  * The simplified state transition diagram for dbufs looks like:
  *
  *		+----> READ ----+
  *		|		|
  *		|		V
  *  (alloc)-->UNCACHED	     CACHED-->EVICTING-->(free)
  *		|		^	 ^
  *		|		|	 |
  *		+----> FILL ----+	 |
  *		|			 |
  *		|			 |
  *		+--------> NOFILL -------+
  *
  * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
  * to find all dbufs in a range of a dnode and must be less than any other
  * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
 	DB_SEARCH = -1,	/* invalid; search sentinel, below all real states */
 	DB_UNCACHED,
 	DB_FILL,
 	DB_NOFILL,
 	DB_READ,
 	DB_CACHED,
 	DB_EVICTING
 } dbuf_states_t;
 
 struct dnode;
 struct dmu_tx;
 
 /*
  * level = 0 means the user data
  * level = 1 means the single indirect block
  * etc.
  */
 
 struct dmu_buf_impl;
 
 /*
  * Values stored in the dr_override_state member of a leaf dirty record
  * (struct dirty_leaf in dbuf_dirty_record_t).
  */
 typedef enum override_states {
 	DR_NOT_OVERRIDDEN,
 	DR_IN_DMU_SYNC,
 	DR_OVERRIDDEN
 } override_states_t;
 
 /*
  * Record describing one dirtying of a dbuf, to be synced in transaction
  * group dr_txg.  The dt union carries the state that is specific to
  * indirect blocks (di) or to leaf blocks (dl).
  */
 typedef struct dbuf_dirty_record {
 	/* link on our parent's dirty list */
 	list_node_t dr_dirty_node;
 
 	/* transaction group this data will sync in */
 	uint64_t dr_txg;
 
 	/* zio of outstanding write IO */
 	zio_t *dr_zio;
 
 	/* pointer back to our dbuf */
 	struct dmu_buf_impl *dr_dbuf;
 
 	/* pointer to next dirty record */
 	struct dbuf_dirty_record *dr_next;
 
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
 	/* How much space was charged to dsl_pool_dirty_space() for this? */
 	unsigned int dr_accounted;
 
 	union dirty_types {
 		struct dirty_indirect {
 
 			/* protect access to list */
 			kmutex_t dr_mtx;
 
 			/* Our list of dirty children */
 			list_t dr_children;
 		} di;
 		struct dirty_leaf {
 
 			/*
 			 * dr_data is set when we dirty the buffer
 			 * so that we can retain the pointer even if it
 			 * gets COW'd in a subsequent transaction group.
 			 */
 			arc_buf_t *dr_data;
 			blkptr_t dr_overridden_by;
 			/* one of the override_states_t values */
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
 			boolean_t dr_nopwrite;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;
 
 /*
  * Implementation-side state of a DMU buffer (dbuf).  The publicly
  * visible dmu_buf_t is embedded as the first member (db); the members
  * that follow db_mtx are protected by it.
  */
 typedef struct dmu_buf_impl {
 	/*
 	 * The following members are immutable, with the exception of
 	 * db.db_data, which is protected by db_mtx.
 	 */
 
 	/* the publicly visible structure */
 	dmu_buf_t db;
 
 	/* the objset we belong to */
 	struct objset *db_objset;
 
 	/*
 	 * handle to safely access the dnode we belong to (NULL when evicted)
 	 */
 	struct dnode_handle *db_dnode_handle;
 
 	/*
 	 * our parent buffer; if the dnode points to us directly,
 	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 	 * only accessed by sync thread ???
 	 * (NULL when evicted)
 	 * May change from NULL to non-NULL under the protection of db_mtx
 	 * (see dbuf_check_blkptr())
 	 */
 	struct dmu_buf_impl *db_parent;
 
 	/*
 	 * link for hash table of all dmu_buf_impl_t's
 	 */
 	struct dmu_buf_impl *db_hash_next;
 
 	/* our block number */
 	uint64_t db_blkid;
 
 	/*
 	 * Pointer to the blkptr_t which points to us. May be NULL if we
 	 * don't have one yet. (NULL when evicted)
 	 */
 	blkptr_t *db_blkptr;
 
 	/*
 	 * Our indirection level.  Data buffers have db_level==0.
 	 * Indirect buffers which point to data buffers have
 	 * db_level==1. etc.  Buffers which contain dnodes have
 	 * db_level==0, since the dnodes are stored in a file.
 	 */
 	uint8_t db_level;
 
 	/* db_mtx protects the members below */
 	kmutex_t db_mtx;
 
 	/*
 	 * Current state of the buffer
 	 */
 	dbuf_states_t db_state;
 
 	/*
 	 * Refcount accessed by dmu_buf_{hold,rele}.
 	 * If nonzero, the buffer can't be destroyed.
 	 * Protected by db_mtx.
 	 */
 	refcount_t db_holds;
 
 	/* buffer holding our data */
 	arc_buf_t *db_buf;
 
 	kcondvar_t db_changed;
 	dbuf_dirty_record_t *db_data_pending;
 
 	/* pointer to most recent dirty record for this buffer */
 	dbuf_dirty_record_t *db_last_dirty;
 
 	/*
 	 * Our link on the owner dnode's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
 	avl_node_t db_link;
 
 	/* Data which is unique to data (leaf) blocks: */
 
 	/* stuff we store for the user (see dmu_buf_set_user) */
 	void *db_user_ptr;
-	void **db_user_data_ptr_ptr;
 	dmu_buf_evict_func_t *db_evict_func;
 
 	uint8_t db_immediate_evict;
 	uint8_t db_freed_in_flight;
 
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
 #define	DBUF_MUTEXES 256
 /*
  * Select the mutex in table h that covers hash index idx.  The index is
  * masked with (DBUF_MUTEXES-1), so DBUF_MUTEXES must stay a power of two.
  */
 #define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
 	dmu_buf_impl_t **hash_table;
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
 
 uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
 void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
 
 void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
 int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
     void *tag, dmu_buf_impl_t **dbp);
 
 void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
 dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
 #define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
 #define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
 #define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
 
 void dbuf_init(void);
 void dbuf_fini(void);
 
 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
 #define	DBUF_GET_BUFC_TYPE(_db)	\
 	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 #define	DBUF_IS_CACHEABLE(_db)						\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
 #define	DBUF_IS_L2CACHEABLE(_db)					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #define	DBUF_IS_L2COMPRESSIBLE(_db)					\
 	((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF ||		\
 	(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
 
 #ifdef ZFS_DEBUG
 
 /*
  * There should be a ## between the string literal and fmt, to make it
  * clear that we're joining two strings together, but gcc does not
  * support that preprocessor token.
  */
 #define	dprintf_dbuf(dbuf, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char __db_buf[32]; \
 	uint64_t __db_obj = (dbuf)->db.db_object; \
 	if (__db_obj == DMU_META_DNODE_OBJECT) \
 		(void) strcpy(__db_buf, "mdn"); \
 	else \
 		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
 		    (u_longlong_t)__db_obj); \
 	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
 	    "obj=%s lvl=%u blkid=%lld " fmt, \
 	    __db_buf, (dbuf)->db_level, \
 	    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	}							\
 _NOTE(CONSTCOND) } while (0)
 
 #define	DBUF_VERIFY(db)	dbuf_verify(db)
 
 #else
 
 #define	dprintf_dbuf(db, fmt, ...)
 #define	dprintf_dbuf_bp(db, bp, fmt, ...)
 #define	DBUF_VERIFY(db)
 
 #endif
 
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DBUF_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 275782)
@@ -1,842 +1,835 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2013 DEY Storage Systems, Inc.
  * Copyright 2014 HybridCluster. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/cred.h>
 #include <sys/time.h>
 #include <sys/fs/zfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct uio;
 struct xuio;
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
 struct sa_handle;
 struct file;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
 	DMU_BSWAP_UINT16,
 	DMU_BSWAP_UINT32,
 	DMU_BSWAP_UINT64,
 	DMU_BSWAP_ZAP,
 	DMU_BSWAP_DNODE,
 	DMU_BSWAP_OBJSET,
 	DMU_BSWAP_ZNODE,
 	DMU_BSWAP_OLDACL,
 	DMU_BSWAP_ACL,
 	/*
 	 * Allocating a new byteswap type number makes the on-disk format
 	 * incompatible with any other format that uses the same number.
 	 *
 	 * Data can usually be structured to work with one of the
 	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
 	 */
 	DMU_BSWAP_NUMFUNCS
 } dmu_object_byteswap_t;
 
 #define	DMU_OT_NEWTYPE 0x80
 #define	DMU_OT_METADATA 0x40
 #define	DMU_OT_BYTESWAP_MASK 0x3f
 
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean) and how to byteswap the data
  * (dmu_object_byteswap_t).
  */
 #define	DMU_OT(byteswap, metadata) \
 	(DMU_OT_NEWTYPE | \
 	((metadata) ? DMU_OT_METADATA : 0) | \
 	((byteswap) & DMU_OT_BYTESWAP_MASK))
 
 #define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)
 
 #define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(ot)].ot_metadata)
 
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
  * is repurposed for embedded BPs.
  */
 #define	DMU_OT_HAS_FILL(ot) \
 	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
 
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	dmu_ot[(ot)].ot_byteswap)
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPOBJ,			/* UINT64 */
 	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_OLDACL,			/* Old ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 	DMU_OT_DSL_PERMS,		/* ZAP */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_SA,			/* System attr */
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_DEADLIST,		/* ZAP */
 	DMU_OT_DEADLIST_HDR,		/* UINT64 */
 	DMU_OT_DSL_CLONES,		/* ZAP */
 	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	/*
 	 * Do not allocate new object types here. Doing so makes the on-disk
 	 * format incompatible with any other format that uses the same object
 	 * type number.
 	 *
 	 * When creating an object which does not have one of the above types
 	 * use the DMU_OTN_* type with the correct byteswap and metadata
 	 * values.
 	 *
 	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
 	 * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
 	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
 	 * and DMU_OTN_* types).
 	 */
 	DMU_OT_NUMTYPES,
 
 	/*
 	 * Names for valid types declared with DMU_OT().
 	 */
 	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
 	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
 	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
 	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
 	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
 	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
 	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
 	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
 	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
 typedef enum txg_how {
 	TXG_WAIT = 1,
 	TXG_NOWAIT,
 	TXG_WAITED,
 } txg_how_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks
  */
 #define	DMU_BONUS_BLKID		(-1ULL)
 #define	DMU_SPILL_BLKID		(-2ULL)
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
 int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
     struct nvlist *snaps);
 int dmu_objset_clone(const char *name, const char *origin);
 int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_snapshot_tmp(const char *, const char *, int);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
 #define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on an objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allocated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx);
 
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_read() to
  * read db_data, dmu_buf_will_dirty() before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
  *
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
 dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
  * Special spill buffer support used by "SA" framework
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
  * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 /*
  * Returns NULL on success, or the existing user ptr if it's already
  * been set.
  *
  * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
  *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it.  Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
- *
  * If non-NULL, pageout func will be called when this buffer is being
  * excised from the cache, so that you can clean up the data structure
  * pointed to by user_ptr.
  *
  * dmu_evict_user() will call the pageout func for all buffers in an
  * objset with a given pageout func.
  */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr,
     dmu_buf_evict_func_t *pageout_func);
 /*
  * set_user_ie is the same as set_user, but request immediate eviction
  * when hold count goes to zero.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
-    void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+    void *user_ptr, dmu_buf_evict_func_t *pageout_func);
 void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
 
 /*
  * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
 struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * Tells if the given dbuf is freeable.
  */
 boolean_t dmu_buf_freeable(dmu_buf_t *);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
  *
  * dcb_data is a pointer to caller private data that is passed on as a
  * callback parameter. The caller is responsible for properly allocating and
  * freeing it.
  *
  * When registering a callback, the transaction must be already created, but
  * it cannot be committed or aborted. It can be assigned to a txg or not.
  *
  * The callback will be called after the transaction has been safely written
  * to stable storage and will also be called if the dmu_tx is aborted.
  * If there is any error which prevents the transaction from being committed to
  * disk, the callback will be called with a value of error != 0.
  */
 typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
     void *dcb_data);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size);
 int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 #define	DMU_READ_PREFETCH	0 /* prefetch */
 #define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 #ifdef _KERNEL
 #ifdef sun
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 #else
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
 #endif
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
     dmu_tx_t *tx);
 int dmu_xuio_init(struct xuio *uio, int niov);
 void dmu_xuio_fini(struct xuio *uio);
 int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
     size_t n);
 int dmu_xuio_cnt(struct xuio *uio);
 struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
 void dmu_xuio_clear(struct xuio *uio, int i);
 void xuio_stat_wbuf_copied();
 void xuio_stat_wbuf_nocopy();
 
 extern int zfs_prefetch_disable;
 extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t len);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_nblkptr;
 	uint8_t doi_pad[4];
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void arc_byteswap_func_t(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {
 	dmu_object_byteswap_t	ot_byteswap;
 	boolean_t		ot_metadata;
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 typedef struct dmu_object_byteswap_info {
 	arc_byteswap_func_t	*ob_func;
 	char			*ob_name;
 } dmu_object_byteswap_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
  * Like dmu_object_info_from_db, but faster still when you only care about
  * the size.  This is specifically optimized for zfs_getattr().
  */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	char dds_origin[MAXNAMELEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 /*
  * Get the [cm]time for an objset's snapshot dir
  */
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
 extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 typedef int objset_used_cb_t(dmu_object_type_t bonustype,
     void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 
 /*
  * {zfs,zvol,ztest}_get_done() args
  */
 typedef struct zgd {
 	struct zilog	*zgd_zilog;
 	struct blkptr	*zgd_bp;
 	dmu_buf_t	*zgd_db;
 	struct rl	*zgd_rl;
 	void		*zgd_private;
 } zgd_t;
 
 typedef void dmu_sync_cb_t(zgd_t *arg, int error);
 int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
     struct file *fp, offset_t *offp);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 extern int zfs_mdcomp_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 275782)
@@ -1,314 +1,322 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DATASET_H
 #define	_SYS_DSL_DATASET_H
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/refcount.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;
 
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 #define	DS_IS_INCONSISTENT(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
 
 /*
  * Do not allow this dataset to be promoted.
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
 /*
  * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
  * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
  * refquota/refreservations).
  */
 #define	DS_FLAG_UNIQUE_ACCURATE	(1ULL<<2)
 
 /*
  * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
  * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
  */
 #define	DS_FLAG_DEFER_DESTROY	(1ULL<<3)
 #define	DS_IS_DEFER_DESTROY(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
 
 /*
  * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
  * They should be of the format <reverse-dns>:<field>.
  */
 
 /*
  * This field's value is the object ID of a zap object which contains the
  * bookmarks of this dataset.  If it is present, then this dataset is counted
- * in the refcount of the SPA_FEATURES_BOOKMARKS feature.
+ * in the refcount of the SPA_FEATURE_BOOKMARKS feature.
  */
 #define	DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
 
 /*
  * This field is present (with value=0) if this dataset may contain large
  * blocks (>128KB).  If it is present, then this dataset
  * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
  */
 #define	DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
 
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
  */
 #define	DS_FLAG_CI_DATASET	(1ULL<<16)
 
 #define	DS_CREATE_FLAG_NODIRTY	(1ULL<<24)
 
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;		/* DMU_OT_DSL_DIR */
 	uint64_t ds_prev_snap_obj;	/* DMU_OT_DSL_DATASET */
 	uint64_t ds_prev_snap_txg;
 	uint64_t ds_next_snap_obj;	/* DMU_OT_DSL_DATASET */
 	uint64_t ds_snapnames_zapobj;	/* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
 	uint64_t ds_deadlist_obj;	/* DMU_OT_DEADLIST */
 	/*
 	 * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
 	 * include all blocks referenced by this dataset, including those
 	 * shared with any other datasets.
 	 */
 	uint64_t ds_referenced_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
 	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
 	/*
 	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
 	 * collisions.  The ds_guid is a 64-bit ID that will never
 	 * change, so there is a small probability that it will collide.
 	 */
 	uint64_t ds_fsid_guid;
 	uint64_t ds_guid;
 	uint64_t ds_flags;		/* DS_FLAG_* */
 	blkptr_t ds_bp;
 	uint64_t ds_next_clones_obj;	/* DMU_OT_DSL_CLONES */
 	uint64_t ds_props_obj;		/* DMU_OT_DSL_PROPS for snaps */
 	uint64_t ds_userrefs_obj;	/* DMU_OT_USERREFS */
 	uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
 	/* Immutable: */
 	struct dsl_dir *ds_dir;
-	dsl_dataset_phys_t *ds_phys;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 	uint64_t ds_fsid_guid;
 
 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
 	uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
 	boolean_t ds_large_blocks;
 	boolean_t ds_need_large_blocks;
 
 	/* has internal locking: */
 	dsl_deadlist_t ds_deadlist;
 	bplist_t ds_pending_deadlist;
 
 	/* protected by lock on pool's dp_dirty_datasets list */
 	txg_node_t ds_dirty_link;
 	list_node_t ds_synced_link;
 
 	/*
-	 * ds_phys->ds_<accounting> is also protected by ds_lock.
+	 * dsl_dataset_phys(ds)->ds_<accounting> is also protected by ds_lock.
 	 * Protected by ds_lock:
 	 */
 	kmutex_t ds_lock;
 	objset_t *ds_objset;
 	uint64_t ds_userrefs;
 	void *ds_owner;
 
 	/*
 	 * Long holds prevent the ds from being destroyed; they allow the
 	 * ds to remain held even after dropping the dp_config_rwlock.
 	 * Owning counts as a long hold.  See the comments above
 	 * dsl_pool_hold() for details.
 	 */
 	refcount_t ds_longholds;
 
 	/* no locking; only for making guesses */
 	uint64_t ds_trysnap_txg;
 
 	/* for objset_open() */
 	kmutex_t ds_opening_lock;
 
 	uint64_t ds_reserved;	/* cached refreservation */
 	uint64_t ds_quota;	/* cached refquota */
 
 	kmutex_t ds_sendstream_lock;
 	list_t ds_sendstreams;
 
 	/* Protected by ds_lock; keep at end of struct for better locality */
 	char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
+static inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+	return (ds->ds_dbuf->db_data);	/* phys is the dbuf's data */
+}
+
 /*
  * The max length of a temporary tag prefix is the number of hex digits
  * required to express UINT64_MAX plus one for the hyphen.
  */
 #define	MAX_TAG_PREFIX_LEN	17
 
-#define	dsl_dataset_is_snapshot(ds) \
-	((ds)->ds_phys->ds_num_children != 0)
+static inline boolean_t
+dsl_dataset_is_snapshot(dsl_dataset_t *ds)
+{
+	return (dsl_dataset_phys(ds)->ds_num_children != 0);
+}
 
 #define	DS_UNIQUE_IS_ACCURATE(ds)	\
-	(((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+	((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
 int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
     dsl_dataset_t **dsp);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
 int dsl_dataset_own(struct dsl_pool *dp, const char *name,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx);
 int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
 int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
     minor_t cleanup_minor, const char *htag);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
 boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
     dsl_dataset_t *snap);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
 int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx, boolean_t async);
 boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
 void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
 void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
 void dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
 int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
 int dsl_dataset_activate_large_blocks(const char *dsname);
 void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used,
     uint64_t *ref_rsrv);
 int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t quota);
 int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
 boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
     uint64_t earlier_txg);
 void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
 
 int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
 void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx);
 int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
 void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx);
 
 void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx);
 void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
 int dsl_dataset_get_snapname(dsl_dataset_t *ds);
 int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
     uint64_t *value);
 int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
     boolean_t adj_cnt);
 void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx);
 void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
 int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
 	kmem_free(__ds_name, MAXNAMELEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_ds(dd, fmt, ...)
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DSL_DATASET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	(revision 275782)
@@ -1,183 +1,190 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DIR_H
 #define	_SYS_DSL_DIR_H
 
 #include <sys/dmu.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/refcount.h>
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_dataset;
 
 /*
  * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
  * They should be of the format <reverse-dns>:<field>.
  */
 
 #define	DD_FIELD_FILESYSTEM_COUNT	"com.joyent:filesystem_count"
 #define	DD_FIELD_SNAPSHOT_COUNT		"com.joyent:snapshot_count"
 
 typedef enum dd_used {
 	DD_USED_HEAD,
 	DD_USED_SNAP,
 	DD_USED_CHILD,
 	DD_USED_CHILD_RSRV,
 	DD_USED_REFRSRV,
 	DD_USED_NUM
 } dd_used_t;
 
 #define	DD_FLAG_USED_BREAKDOWN (1<<0)
 
 typedef struct dsl_dir_phys {
 	uint64_t dd_creation_time; /* not actually used */
 	uint64_t dd_head_dataset_obj;
 	uint64_t dd_parent_obj;
 	uint64_t dd_origin_obj;
 	uint64_t dd_child_dir_zapobj;
 	/*
 	 * how much space our children are accounting for; for leaf
 	 * datasets, == physical space used by fs + snaps
 	 */
 	uint64_t dd_used_bytes;
 	uint64_t dd_compressed_bytes;
 	uint64_t dd_uncompressed_bytes;
 	/* Administrative quota setting */
 	uint64_t dd_quota;
 	/* Administrative reservation setting */
 	uint64_t dd_reserved;
 	uint64_t dd_props_zapobj;
 	uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
 	uint64_t dd_flags;
 	uint64_t dd_used_breakdown[DD_USED_NUM];
 	uint64_t dd_clones; /* dsl_dir objects */
 	uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
 } dsl_dir_phys_t;
 
 struct dsl_dir {
 	/* These are immutable; no lock needed: */
 	uint64_t dd_object;
-	dsl_dir_phys_t *dd_phys;
-	dmu_buf_t *dd_dbuf;
 	dsl_pool_t *dd_pool;
 
+	/* Stable until user eviction; no lock needed: */
+	dmu_buf_t *dd_dbuf;
+
 	/* protected by lock on pool's dp_dirty_dirs list */
 	txg_node_t dd_dirty_link;
 
 	/* protected by dp_config_rwlock */
 	dsl_dir_t *dd_parent;
 
 	/* Protected by dd_lock */
 	kmutex_t dd_lock;
 	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
 	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
 	uint64_t dd_origin_txg;
 
 	/* gross estimate of space used by in-flight tx's */
 	uint64_t dd_tempreserved[TXG_SIZE];
 	/* amount of space we expect to write; == amount of dirty data */
 	int64_t dd_space_towrite[TXG_SIZE];
 
 	/* protected by dd_lock; keep at end of struct for better locality */
 	char dd_myname[MAXNAMELEN];
 };
+
+static inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+	return (dd->dd_dbuf->db_data);	/* phys is the dbuf's data */
+}
 
 void dsl_dir_rele(dsl_dir_t *dd, void *tag);
 int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **, const char **tail);
 int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **);
 void dsl_dir_name(dsl_dir_t *dd, char *buf);
 int dsl_dir_namelen(dsl_dir_t *dd);
 uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
     const char *name, dmu_tx_t *tx);
 void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
 uint64_t dsl_dir_space_available(dsl_dir_t *dd,
     dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
 void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
 void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
 int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
     uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep,
     dmu_tx_t *tx);
 void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
 void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
 void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
 void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
 int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation);
 int dsl_dir_activate_fs_ss_limit(const char *);
 int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
     cred_t *);
 void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
 int dsl_dir_rename(const char *oldname, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
 void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
 timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
 void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
     dmu_tx_t *tx);
 void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
 boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
 #define	ORIGIN_DIR_NAME "$ORIGIN"
 #define	XLATION_DIR_NAME "$XLATION"
 #define	FREE_DIR_NAME "$FREE"
 #define	LEAK_DIR_NAME "$LEAK"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
 	    KM_SLEEP); \
 	dsl_dir_name(dd, __ds_name); \
 	dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
 	kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_dd(dd, fmt, ...)
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DSL_DIR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	(revision 275782)
@@ -1,227 +1,236 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
 #define	_SYS_ZAP_IMPL_H
 
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/avl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 extern int fzap_default_block_shift;
 
 #define	ZAP_MAGIC 0x2F52AB2ABULL
 
 #define	FZAP_BLOCK_SHIFT(zap)	((zap)->zap_f.zap_block_shift)
 
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
 #define	MZAP_MAX_BLKSZ		SPA_OLD_MAXBLOCKSIZE
 
 #define	ZAP_NEED_CD		(-1U)
 
 typedef struct mzap_ent_phys {
 	uint64_t mze_value;
 	uint32_t mze_cd;
 	uint16_t mze_pad;	/* in case we want to chain them someday */
 	char mze_name[MZAP_NAME_LEN];
 } mzap_ent_phys_t;
 
 typedef struct mzap_phys {
 	uint64_t mz_block_type;	/* ZBT_MICRO */
 	uint64_t mz_salt;
 	uint64_t mz_normflags;
 	uint64_t mz_pad[5];
 	mzap_ent_phys_t mz_chunk[1];
 	/* actually variable size depending on block size */
 } mzap_phys_t;
 
 typedef struct mzap_ent {
 	avl_node_t mze_node;
 	int mze_chunkid;
 	uint64_t mze_hash;
 	uint32_t mze_cd; /* copy from mze_phys->mze_cd */
 } mzap_ent_t;
 
 #define	MZE_PHYS(zap, mze) \
-	(&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+	(&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
 
 /*
  * The (fat) zap is stored in one object. It is an array of
  * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
  *
  * ptrtbl fits in first block:
  * 	[zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
  *
  * ptrtbl too big for first block:
  * 	[zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
  *
  */
 
 struct dmu_buf;
 struct zap_leaf;
 
 #define	ZBT_LEAF		((1ULL << 63) + 0)
 #define	ZBT_HEADER		((1ULL << 63) + 1)
 #define	ZBT_MICRO		((1ULL << 63) + 3)
 /* any other values are ptrtbl blocks */
 
 /*
  * the embedded pointer table takes up half a block:
  * block size / entry size (2^3) / 2
  */
 #define	ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
 
 /*
  * The embedded pointer table starts half-way through the block.  Since
  * the pointer table itself is half the block, it starts at (64-bit)
  * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
  */
 #define	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
-	((uint64_t *)(zap)->zap_f.zap_phys) \
+	((uint64_t *)zap_f_phys(zap)) \
 	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
 
 /*
  * TAKE NOTE:
  * If zap_phys_t is modified, zap_byteswap() must be modified.
  */
 typedef struct zap_phys {
 	uint64_t zap_block_type;	/* ZBT_HEADER */
 	uint64_t zap_magic;		/* ZAP_MAGIC */
 
 	struct zap_table_phys {
 		uint64_t zt_blk;	/* starting block number */
 		uint64_t zt_numblks;	/* number of blocks */
 		uint64_t zt_shift;	/* bits to index it */
 		uint64_t zt_nextblk;	/* next (larger) copy start block */
 		uint64_t zt_blks_copied; /* number source blocks copied */
 	} zap_ptrtbl;
 
 	uint64_t zap_freeblk;		/* the next free block */
 	uint64_t zap_num_leafs;		/* number of leafs */
 	uint64_t zap_num_entries;	/* number of entries */
 	uint64_t zap_salt;		/* salt to stir into hash function */
 	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
 	uint64_t zap_flags;		/* zap_flags_t */
 	/*
 	 * This structure is followed by padding, and then the embedded
 	 * pointer table.  The embedded pointer table takes up second
 	 * half of the block.  It is accessed using the
 	 * ZAP_EMBEDDED_PTRTBL_ENT() macro.
 	 */
 } zap_phys_t;
 
 typedef struct zap_table_phys zap_table_phys_t;
 
 typedef struct zap {
 	objset_t *zap_objset;
 	uint64_t zap_object;
 	struct dmu_buf *zap_dbuf;
 	krwlock_t zap_rwlock;
 	boolean_t zap_ismicro;
 	int zap_normflags;
 	uint64_t zap_salt;
 	union {
 		struct {
-			zap_phys_t *zap_phys;
-
 			/*
 			 * zap_num_entries_mtx protects
 			 * zap_num_entries
 			 */
 			kmutex_t zap_num_entries_mtx;
 			int zap_block_shift;
 		} zap_fat;
 		struct {
-			mzap_phys_t *zap_phys;
 			int16_t zap_num_entries;
 			int16_t zap_num_chunks;
 			int16_t zap_alloc_next;
 			avl_tree_t zap_avl;
 		} zap_micro;
 	} zap_u;
 } zap_t;
+
+inline zap_phys_t *
+zap_f_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
+
+inline mzap_phys_t *
+zap_m_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
 
 typedef struct zap_name {
 	zap_t *zn_zap;
 	int zn_key_intlen;
 	const void *zn_key_orig;
 	int zn_key_orig_numints;
 	const void *zn_key_norm;
 	int zn_key_norm_numints;
 	uint64_t zn_hash;
 	matchtype_t zn_matchtype;
 	char zn_normbuf[ZAP_MAXNAMELEN];
 } zap_name_t;
 
 #define	zap_f	zap_u.zap_fat
 #define	zap_m	zap_u.zap_micro
 
 boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
 void zap_evict(dmu_buf_t *db, void *vmzap);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
 uint32_t zap_maxcd(zap_t *zap);
 uint64_t zap_getflags(zap_t *zap);
 
 #define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
 
 void fzap_byteswap(void *buf, size_t size);
 int fzap_count(zap_t *zap, uint64_t *count);
 int fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
 void fzap_prefetch(zap_name_t *zn);
 int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
     uint64_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
 int fzap_update(zap_name_t *zn,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
 int fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers);
 int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
 int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
 void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
 void zap_put_leaf(struct zap_leaf *l);
 
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_ZAP_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h	(revision 275782)
@@ -1,242 +1,246 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_LEAF_H
 #define	_SYS_ZAP_LEAF_H
 
 #include <sys/zap.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct zap;
 struct zap_name;
 struct zap_stats;
 
 #define	ZAP_LEAF_MAGIC 0x2AB1EAF
 
 /* chunk size = 24 bytes */
 #define	ZAP_LEAF_CHUNKSIZE 24
 
 /*
  * The amount of space available for chunks is:
  * block size (1<<l->l_bs) - hash entry size (2) * number of hash
  * entries - header space (2*chunksize)
  */
 #define	ZAP_LEAF_NUMCHUNKS(l) \
 	(((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
 	ZAP_LEAF_CHUNKSIZE - 2)
 
 /*
  * The amount of space within the chunk available for the array is:
  * chunk size - space for type (1) - space for next pointer (2)
  */
 #define	ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
 
 #define	ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
 	(((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
 
 /*
  * Low water mark:  when there are only this many chunks free, start
  * growing the ptrtbl.  Ideally, this should be larger than a
  * "reasonably-sized" entry.  20 chunks is more than enough for the
  * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
  * while still being only around 3% for 16k blocks.
  */
 #define	ZAP_LEAF_LOW_WATER (20)
 
 /*
  * The leaf hash table has block size / 2^5 (32) number of entries,
  * which should be more than enough for the maximum number of entries,
  * which is less than block size / CHUNKSIZE (24) / minimum number of
  * chunks per entry (3).
  */
 #define	ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
 #define	ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
 
 /*
  * The chunks start immediately after the hash table.  The end of the
  * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
  * chunk_t.
  */
 #define	ZAP_LEAF_CHUNK(l, idx) \
 	((zap_leaf_chunk_t *) \
-	((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+	(zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
 #define	ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
 
 typedef enum zap_chunk_type {
 	ZAP_CHUNK_FREE = 253,
 	ZAP_CHUNK_ENTRY = 252,
 	ZAP_CHUNK_ARRAY = 251,
 	ZAP_CHUNK_TYPE_MAX = 250
 } zap_chunk_type_t;
 
 #define	ZLF_ENTRIES_CDSORTED (1<<0)
 
 /*
  * TAKE NOTE:
  * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
  */
 typedef struct zap_leaf_phys {
 	struct zap_leaf_header {
 		/* Public to ZAP */
 		uint64_t lh_block_type;		/* ZBT_LEAF */
 		uint64_t lh_pad1;
 		uint64_t lh_prefix;		/* hash prefix of this leaf */
 		uint32_t lh_magic;		/* ZAP_LEAF_MAGIC */
 		uint16_t lh_nfree;		/* number free chunks */
 		uint16_t lh_nentries;		/* number of entries */
 		uint16_t lh_prefix_len;		/* num bits used to id this */
 
 		/* Private to zap_leaf */
 		uint16_t lh_freelist;		/* chunk head of free list */
 		uint8_t lh_flags;		/* ZLF_* flags */
 		uint8_t lh_pad2[11];
 	} l_hdr; /* 2 24-byte chunks */
 
 	/*
 	 * The header is followed by a hash table with
 	 * ZAP_LEAF_HASH_NUMENTRIES(zap) entries.  The hash table is
 	 * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
 	 * zap_leaf_chunk structures.  These structures are accessed
 	 * with the ZAP_LEAF_CHUNK() macro.
 	 */
 
 	uint16_t l_hash[1];
 } zap_leaf_phys_t;
 
 typedef union zap_leaf_chunk {
 	struct zap_leaf_entry {
 		uint8_t le_type; 		/* always ZAP_CHUNK_ENTRY */
 		uint8_t le_value_intlen;	/* size of value's ints */
 		uint16_t le_next;		/* next entry in hash chain */
 		uint16_t le_name_chunk;		/* first chunk of the name */
 		uint16_t le_name_numints;	/* ints in name (incl null) */
 		uint16_t le_value_chunk;	/* first chunk of the value */
 		uint16_t le_value_numints;	/* value length in ints */
 		uint32_t le_cd;			/* collision differentiator */
 		uint64_t le_hash;		/* hash value of the name */
 	} l_entry;
 	struct zap_leaf_array {
 		uint8_t la_type;		/* always ZAP_CHUNK_ARRAY */
 		uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
 		uint16_t la_next;		/* next blk or CHAIN_END */
 	} l_array;
 	struct zap_leaf_free {
 		uint8_t lf_type;		/* always ZAP_CHUNK_FREE */
 		uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
 		uint16_t lf_next;	/* next in free list, or CHAIN_END */
 	} l_free;
 } zap_leaf_chunk_t;
 
 typedef struct zap_leaf {
 	krwlock_t l_rwlock;
 	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
 	int l_bs;			/* block size shift */
 	dmu_buf_t *l_dbuf;
-	zap_leaf_phys_t *l_phys;
 } zap_leaf_t;
 
+inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)
+{
+	return (l->l_dbuf->db_data);
+}
 
 typedef struct zap_entry_handle {
 	/* Set by zap_leaf and public to ZAP */
 	uint64_t zeh_num_integers;
 	uint64_t zeh_hash;
 	uint32_t zeh_cd;
 	uint8_t zeh_integer_size;
 
 	/* Private to zap_leaf */
 	uint16_t zeh_fakechunk;
 	uint16_t *zeh_chunkp;
 	zap_leaf_t *zeh_leaf;
 } zap_entry_handle_t;
 
 /*
  * Return a handle to the named entry, or ENOENT if not found.  The hash
  * value must equal zap_hash(name).
  */
 extern int zap_leaf_lookup(zap_leaf_t *l,
     struct zap_name *zn, zap_entry_handle_t *zeh);
 
 /*
  * Return a handle to the entry with this hash+cd, or the entry with the
  * next closest hash+cd.
  */
 extern int zap_leaf_lookup_closest(zap_leaf_t *l,
     uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
 
 /*
  * Read the first num_integers in the attribute.  Integer size
  * conversion will be done without sign extension.  Return EINVAL if
  * integer_size is too small.  Return EOVERFLOW if there are more than
  * num_integers in the attribute.
  */
 extern int zap_entry_read(const zap_entry_handle_t *zeh,
     uint8_t integer_size, uint64_t num_integers, void *buf);
 
 extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
     uint16_t buflen, char *buf);
 
 /*
  * Replace the value of an existing entry.
  *
  * May fail if it runs out of space (ENOSPC).
  */
 extern int zap_entry_update(zap_entry_handle_t *zeh,
     uint8_t integer_size, uint64_t num_integers, const void *buf);
 
 /*
  * Remove an entry.
  */
 extern void zap_entry_remove(zap_entry_handle_t *zeh);
 
 /*
  * Create an entry. An equal entry must not exist, and this entry must
  * belong in this leaf (according to its hash value).  Fills in the
  * entry handle on success.  Returns 0 on success or ENOSPC on failure.
  */
 extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh);
 
 /* Determine whether there is another entry with the same normalized form. */
 extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
     struct zap_name *zn, const char *name, struct zap *zap);
 
 /*
  * Other stuff.
  */
 
 extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
 extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
 extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
 extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
     struct zap_stats *zs);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_ZAP_LEAF_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	(revision 275782)
@@ -1,1388 +1,1386 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /*
  * This file contains the top half of the zfs directory structure
  * implementation. The bottom half is in zap_leaf.c.
  *
  * The zdir is an extendable hash data structure. There is a table of
  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  * each a constant size and hold a variable number of directory entries.
  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  *
  * The pointer table holds a power of 2 number of pointers.
  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  * by the pointer at index i in the table holds entries whose hash value
  * has a zd_prefix_len - bit prefix
  */
 
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zap.h>
 #include <sys/refcount.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
 static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
-
 void
 fzap_byteswap(void *vbuf, size_t size)
 {
 	uint64_t block_type;
 
 	block_type = *(uint64_t *)vbuf;
 
 	if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
 		zap_leaf_byteswap(vbuf, size);
 	else {
 		/* it's a ptrtbl block */
 		byteswap_uint64_array(vbuf, size);
 	}
 }
 
 void
 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 {
 	dmu_buf_t *db;
 	zap_leaf_t *l;
 	int i;
 	zap_phys_t *zp;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	zap->zap_ismicro = FALSE;
 
-	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
-	    &zap->zap_f.zap_phys, zap_evict);
+	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict);
 
 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
 	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
 
-	zp = zap->zap_f.zap_phys;
+	zp = zap_f_phys(zap);
 	/*
 	 * explicitly zero it since it might be coming from an
 	 * initialized microzap
 	 */
 	bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
 	zp->zap_block_type = ZBT_HEADER;
 	zp->zap_magic = ZAP_MAGIC;
 
 	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
 
 	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
 	zp->zap_num_leafs = 1;
 	zp->zap_num_entries = 0;
 	zp->zap_salt = zap->zap_salt;
 	zp->zap_normflags = zap->zap_normflags;
 	zp->zap_flags = flags;
 
 	/* block 1 will be the first leaf */
 	for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
 		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
 
 	/*
 	 * set up block 1 - the first leaf
 	 */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db, tx);
 
 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	l->l_dbuf = db;
-	l->l_phys = db->db_data;
 
 	zap_leaf_init(l, zp->zap_normflags != 0);
 
 	kmem_free(l, sizeof (zap_leaf_t));
 	dmu_buf_rele(db, FTAG);
 }
 
 static int
 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
 {
 	if (RW_WRITE_HELD(&zap->zap_rwlock))
 		return (1);
 	if (rw_tryupgrade(&zap->zap_rwlock)) {
 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Generic routines for dealing with the pointer & cookie tables.
  */
 
 static int
 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
     dmu_tx_t *tx)
 {
 	uint64_t b, newblk;
 	dmu_buf_t *db_old, *db_new;
 	int err;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	int hepb = 1<<(bs-4);
 	/* hepb = half the number of entries in a block */
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	ASSERT(tbl->zt_blk != 0);
 	ASSERT(tbl->zt_numblks > 0);
 
 	if (tbl->zt_nextblk != 0) {
 		newblk = tbl->zt_nextblk;
 	} else {
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
 		dmu_prefetch(zap->zap_objset, zap->zap_object,
 		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
 	}
 
 	/*
 	 * Copy the ptrtbl from the old to new location.
 	 */
 
 	b = tbl->zt_blks_copied;
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 
 	/* first half of entries in old[b] go to new[2*b+0] */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func(db_old->db_data, db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	/* second half of entries in old[b] go to new[2*b+1] */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func((uint64_t *)db_old->db_data + hepb,
 	    db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	dmu_buf_rele(db_old, FTAG);
 
 	tbl->zt_blks_copied++;
 
 	dprintf("copied block %llu of %llu\n",
 	    tbl->zt_blks_copied, tbl->zt_numblks);
 
 	if (tbl->zt_blks_copied == tbl->zt_numblks) {
 		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
 		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
 
 		tbl->zt_blk = newblk;
 		tbl->zt_numblks *= 2;
 		tbl->zt_shift++;
 		tbl->zt_nextblk = 0;
 		tbl->zt_blks_copied = 0;
 
 		dprintf("finished; numblocks now %llu (%lluk entries)\n",
 		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
 	}
 
 	return (0);
 }
 
 static int
 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
     dmu_tx_t *tx)
 {
 	int err;
 	uint64_t blk, off;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	dmu_buf_t *db;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT(tbl->zt_blk != 0);
 
 	dprintf("storing %llx at index %llx\n", val, idx);
 
 	blk = idx >> (bs-3);
 	off = idx & ((1<<(bs-3))-1);
 
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 	dmu_buf_will_dirty(db, tx);
 
 	if (tbl->zt_nextblk != 0) {
 		uint64_t idx2 = idx * 2;
 		uint64_t blk2 = idx2 >> (bs-3);
 		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 		dmu_buf_t *db2;
 
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 		    DMU_READ_NO_PREFETCH);
 		if (err) {
 			dmu_buf_rele(db, FTAG);
 			return (err);
 		}
 		dmu_buf_will_dirty(db2, tx);
 		((uint64_t *)db2->db_data)[off2] = val;
 		((uint64_t *)db2->db_data)[off2+1] = val;
 		dmu_buf_rele(db2, FTAG);
 	}
 
 	((uint64_t *)db->db_data)[off] = val;
 	dmu_buf_rele(db, FTAG);
 
 	return (0);
 }
 
 static int
 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 {
 	uint64_t blk, off;
 	int err;
 	dmu_buf_t *db;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	blk = idx >> (bs-3);
 	off = idx & ((1<<(bs-3))-1);
 
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
 	dmu_buf_rele(db, FTAG);
 
 	if (tbl->zt_nextblk != 0) {
 		/*
 		 * read the nextblk for the sake of i/o error checking,
 		 * so that zap_table_load() will catch errors for
 		 * zap_table_store.
 		 */
 		blk = (idx*2) >> (bs-3);
 
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (err == 0)
 			dmu_buf_rele(db, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Routines for growing the ptrtbl.
  */
 
 static void
 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 {
 	int i;
 	for (i = 0; i < n; i++) {
 		uint64_t lb = src[i];
 		dst[2*i+0] = lb;
 		dst[2*i+1] = lb;
 	}
 }
 
 static int
 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 {
 	/*
 	 * The pointer table should never use more hash bits than we
 	 * have (otherwise we'd be using useless zero bits to index it).
 	 * If we are within 2 bits of running out, stop growing, since
 	 * this is already an aberrant condition.
 	 */
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 		return (SET_ERROR(ENOSPC));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/*
 		 * We are outgrowing the "embedded" ptrtbl (the one
 		 * stored in the header block).  Give it its own entire
 		 * block, which will double the size of the ptrtbl.
 		 */
 		uint64_t newblk;
 		dmu_buf_t *db_new;
 		int err;
 
-		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk);
+		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 
 		newblk = zap_allocate_blocks(zap, 1);
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 		    DMU_READ_NO_PREFETCH);
 		if (err)
 			return (err);
 		dmu_buf_will_dirty(db_new, tx);
 		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		dmu_buf_rele(db_new, FTAG);
 
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 
-		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 		    (FZAP_BLOCK_SHIFT(zap)-3));
 
 		return (0);
 	} else {
-		return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    zap_ptrtbl_transfer, tx));
 	}
 }
 
 static void
 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 {
 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
-	zap->zap_f.zap_phys->zap_num_entries += delta;
+	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+	zap_f_phys(zap)->zap_num_entries += delta;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
 static uint64_t
 zap_allocate_blocks(zap_t *zap, int nblocks)
 {
 	uint64_t newblk;
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	newblk = zap->zap_f.zap_phys->zap_freeblk;
-	zap->zap_f.zap_phys->zap_freeblk += nblocks;
+	newblk = zap_f_phys(zap)->zap_freeblk;
+	zap_f_phys(zap)->zap_freeblk += nblocks;
 	return (newblk);
 }
 
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
 	void *winner;
 	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	rw_init(&l->l_rwlock, 0, 0, 0);
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = zap_allocate_blocks(zap, 1);
 	l->l_dbuf = NULL;
-	l->l_phys = NULL;
 
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 	    DMU_READ_NO_PREFETCH));
-	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+	winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout);
 	ASSERT(winner == NULL);
 	dmu_buf_will_dirty(l->l_dbuf, tx);
 
 	zap_leaf_init(l, zap->zap_normflags != 0);
 
-	zap->zap_f.zap_phys->zap_num_leafs++;
+	zap_f_phys(zap)->zap_num_leafs++;
 
 	return (l);
 }
 
 int
 fzap_count(zap_t *zap, uint64_t *count)
 {
 	ASSERT(!zap->zap_ismicro);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-	*count = zap->zap_f.zap_phys->zap_num_entries;
+	*count = zap_f_phys(zap)->zap_num_entries;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 	return (0);
 }
 
 /*
  * Routines for obtaining zap_leaf_t's
  */
 
 void
 zap_put_leaf(zap_leaf_t *l)
 {
 	rw_exit(&l->l_rwlock);
 	dmu_buf_rele(l->l_dbuf, NULL);
 }
 
 _NOTE(ARGSUSED(0))
 static void
 zap_leaf_pageout(dmu_buf_t *db, void *vl)
 {
 	zap_leaf_t *l = vl;
 
 	rw_destroy(&l->l_rwlock);
 	kmem_free(l, sizeof (zap_leaf_t));
 }
 
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
 	zap_leaf_t *l, *winner;
 
 	ASSERT(blkid != 0);
 
 	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
 	rw_init(&l->l_rwlock, 0, 0, 0);
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = blkid;
 	l->l_bs = highbit64(db->db_size) - 1;
 	l->l_dbuf = db;
-	l->l_phys = NULL;
 
-	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+	winner = dmu_buf_set_user(db, l, zap_leaf_pageout);
 
 	rw_exit(&l->l_rwlock);
 	if (winner != NULL) {
 		/* someone else set it first */
 		zap_leaf_pageout(NULL, l);
 		l = winner;
 	}
 
 	/*
 	 * lhr_pad was previously used for the next leaf in the leaf
 	 * chain.  There should be no chained leafs (as we have removed
 	 * support for them).
 	 */
-	ASSERT0(l->l_phys->l_hdr.lh_pad1);
+	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 
 	/*
 	 * There should be more hash entries than there can be
 	 * chunks to put in the hash table
 	 */
 	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 
 	/* The chunks should begin at the end of the hash table */
 	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
-	    &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 
 	/* The chunks should end at the end of the block */
 	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
-	    (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 
 	return (l);
 }
 
 static int
 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
     zap_leaf_t **lp)
 {
 	dmu_buf_t *db;
 	zap_leaf_t *l;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	int err;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 
 	ASSERT3U(db->db_object, ==, zap->zap_object);
 	ASSERT3U(db->db_offset, ==, blkid << bs);
 	ASSERT3U(db->db_size, ==, 1 << bs);
 	ASSERT(blkid != 0);
 
 	l = dmu_buf_get_user(db);
 
 	if (l == NULL)
 		l = zap_open_leaf(blkid, db);
 
 	rw_enter(&l->l_rwlock, lt);
 	/*
-	 * Must lock before dirtying, otherwise l->l_phys could change,
+	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 	 * causing ASSERT below to fail.
 	 */
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 	ASSERT3U(l->l_blkid, ==, blkid);
 	ASSERT3P(l->l_dbuf, ==, db);
-	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
-	ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	*lp = l;
 	return (0);
 }
 
 static int
 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 {
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		ASSERT3U(idx, <,
-		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 		return (0);
 	} else {
-		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, valp));
 	}
 }
 
 static int
 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 {
 	ASSERT(tx != NULL);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 		return (0);
 	} else {
-		return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, blk, tx));
 	}
 }
 
 static int
 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 {
 	uint64_t idx, blk;
 	int err;
 
 	ASSERT(zap->zap_dbuf == NULL ||
-	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
-	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
-	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
+	ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
+	idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	err = zap_idx_to_blk(zap, idx, &blk);
 	if (err != 0)
 		return (err);
 	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 
-	ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
-	    (*lp)->l_phys->l_hdr.lh_prefix);
+	ASSERT(err ||
+	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 	return (err);
 }
 
 static int
 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	zap_leaf_t *nl;
 	int prefix_diff, i, err;
 	uint64_t sibling;
-	int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
-	ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
 	if (zap_tryupgradedir(zap, tx) == 0 ||
-	    old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
 		objset_t *os = zap->zap_objset;
 		uint64_t object = zap->zap_object;
 
 		zap_put_leaf(l);
 		zap_unlockdir(zap);
 		err = zap_lockdir(os, object, tx, RW_WRITER,
 		    FALSE, FALSE, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err)
 			return (err);
 		ASSERT(!zap->zap_ismicro);
 
 		while (old_prefix_len ==
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 			err = zap_grow_ptrtbl(zap, tx);
 			if (err)
 				return (err);
 		}
 
 		err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
 		if (err)
 			return (err);
 
-		if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 			/* it split while our locks were down */
 			*lp = l;
 			return (0);
 		}
 	}
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-	prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+	prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 	    (old_prefix_len + 1);
 	sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 
 	/* check for i/o errors before doing zap_leaf_split */
 	for (i = 0; i < (1ULL<<prefix_diff); i++) {
 		uint64_t blk;
 		err = zap_idx_to_blk(zap, sibling+i, &blk);
 		if (err)
 			return (err);
 		ASSERT3U(blk, ==, l->l_blkid);
 	}
 
 	nl = zap_create_leaf(zap, tx);
 	zap_leaf_split(l, nl, zap->zap_normflags != 0);
 
 	/* set sibling pointers */
 	for (i = 0; i < (1ULL << prefix_diff); i++) {
 		err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
 		ASSERT0(err); /* we checked for i/o errors above */
 	}
 
-	if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 		/* we want the sibling */
 		zap_put_leaf(l);
 		*lp = nl;
 	} else {
 		zap_put_leaf(nl);
 		*lp = l;
 	}
 
 	return (0);
 }
 
 static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 {
 	zap_t *zap = zn->zn_zap;
-	int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
-	    l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
 	zap_put_leaf(l);
 
-	if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+	if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
 		int err;
 
 		/*
 		 * We are in the middle of growing the pointer table, or
 		 * this leaf will soon make us grow it.
 		 */
 		if (zap_tryupgradedir(zap, tx) == 0) {
 			objset_t *os = zap->zap_objset;
 			uint64_t zapobj = zap->zap_object;
 
 			zap_unlockdir(zap);
 			err = zap_lockdir(os, zapobj, tx,
 			    RW_WRITER, FALSE, FALSE, &zn->zn_zap);
 			zap = zn->zn_zap;
 			if (err)
 				return;
 		}
 
 		/* could have finished growing while our locks were down */
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 			(void) zap_grow_ptrtbl(zap, tx);
 	}
 }
 
 static int
 fzap_checkname(zap_name_t *zn)
 {
 	if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	return (0);
 }
 
 static int
 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
 {
 	/* Only integer sizes supported by C */
 	switch (integer_size) {
 	case 1:
 	case 2:
 	case 4:
 	case 8:
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (integer_size * num_integers > ZAP_MAXVALUELEN)
 		return (E2BIG);
 
 	return (0);
 }
 
 static int
 fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 {
 	int err;
 
 	if ((err = fzap_checkname(zn)) != 0)
 		return (err);
 	return (fzap_checksize(integer_size, num_integers));
 }
 
 /*
  * Routines for manipulating attributes.
  */
 int
 fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *ncp)
 {
 	zap_leaf_t *l;
 	int err;
 	zap_entry_handle_t zeh;
 
 	if ((err = fzap_checkname(zn)) != 0)
 		return (err);
 
 	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
 		return (err);
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
 			zap_put_leaf(l);
 			return (err);
 		}
 
 		err = zap_entry_read(&zeh, integer_size, num_integers, buf);
 		(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 		if (ncp) {
 			*ncp = zap_entry_normalization_conflict(&zeh,
 			    zn, NULL, zn->zn_zap);
 		}
 	}
 
 	zap_put_leaf(l);
 	return (err);
 }
 
 int
 fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
 	zap_entry_handle_t zeh;
 	zap_t *zap = zn->zn_zap;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT(!zap->zap_ismicro);
 	ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
 
 	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 retry:
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		err = SET_ERROR(EEXIST);
 		goto out;
 	}
 	if (err != ENOENT)
 		goto out;
 
 	err = zap_entry_create(l, zn, cd,
 	    integer_size, num_integers, val, &zeh);
 
 	if (err == 0) {
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
 
 out:
 	if (zap != NULL)
 		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 	return (err);
 }
 
 int
 fzap_add(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	int err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	return (fzap_add_cd(zn, integer_size, num_integers,
 	    val, ZAP_NEED_CD, tx));
 }
 
 int
 fzap_update(zap_name_t *zn,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err, create;
 	zap_entry_handle_t zeh;
 	zap_t *zap = zn->zn_zap;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 retry:
 	err = zap_leaf_lookup(l, zn, &zeh);
 	create = (err == ENOENT);
 	ASSERT(err == 0 || err == ENOENT);
 
 	if (create) {
 		err = zap_entry_create(l, zn, ZAP_NEED_CD,
 		    integer_size, num_integers, val, &zeh);
 		if (err == 0)
 			zap_increment_num_entries(zap, 1, tx);
 	} else {
 		err = zap_entry_update(&zeh, integer_size, num_integers, val);
 	}
 
 	if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
 
 	if (zap != NULL)
 		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 	return (err);
 }
 
 int
 fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_leaf_t *l;
 	int err;
 	zap_entry_handle_t zeh;
 
 	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
 		return (err);
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err != 0)
 		goto out;
 
 	if (integer_size)
 		*integer_size = zeh.zeh_integer_size;
 	if (num_integers)
 		*num_integers = zeh.zeh_num_integers;
 out:
 	zap_put_leaf(l);
 	return (err);
 }
 
 int
 fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
 	zap_entry_handle_t zeh;
 
 	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		zap_entry_remove(&zeh);
 		zap_increment_num_entries(zn->zn_zap, -1, tx);
 	}
 	zap_put_leaf(l);
 	return (err);
 }
 
 void
 fzap_prefetch(zap_name_t *zn)
 {
 	uint64_t idx, blk;
 	zap_t *zap = zn->zn_zap;
 	int bs;
 
 	idx = ZAP_HASH_IDX(zn->zn_hash,
-	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	if (zap_idx_to_blk(zap, idx, &blk) != 0)
 		return;
 	bs = FZAP_BLOCK_SHIFT(zap);
 	dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
 }
 
 /*
  * Helper functions for consumers.
  */
 
 uint64_t
 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
     const char *name, dmu_tx_t *tx)
 {
 	uint64_t new_obj;
 
 	VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
 	VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 	    tx) == 0);
 
 	return (new_obj);
 }
 
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name)
 {
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	int err;
 
 	if (mask == 0)
 		mask = -1ULL;
 
 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	for (zap_cursor_init(&zc, os, zapobj);
 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		if ((za->za_first_integer & mask) == (value & mask)) {
 			(void) strcpy(name, za->za_name);
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 	kmem_free(za, sizeof (zap_attribute_t));
 	return (err);
 }
 
 int
 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	err = 0;
 	for (zap_cursor_init(&zc, os, fromobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    (void) zap_cursor_advance(&zc)) {
 		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 		err = zap_add(os, intoobj, za.za_name,
 		    8, 1, &za.za_first_integer, tx);
 		if (err)
 			break;
 	}
 	zap_cursor_fini(&zc);
 	return (err);
 }
 
 int
 zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
     uint64_t value, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	err = 0;
 	for (zap_cursor_init(&zc, os, fromobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    (void) zap_cursor_advance(&zc)) {
 		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 		err = zap_add(os, intoobj, za.za_name,
 		    8, 1, &value, tx);
 		if (err)
 			break;
 	}
 	zap_cursor_fini(&zc);
 	return (err);
 }
 
 int
 zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
     dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	err = 0;
 	for (zap_cursor_init(&zc, os, fromobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    (void) zap_cursor_advance(&zc)) {
 		uint64_t delta = 0;
 
 		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 		err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
 		if (err != 0 && err != ENOENT)
 			break;
 		delta += za.za_first_integer;
 		err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
 		if (err)
 			break;
 	}
 	zap_cursor_fini(&zc);
 	return (err);
 }
 
 int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 	return (zap_add(os, obj, name, 8, 1, &value, tx));
 }
 
 int
 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 	return (zap_remove(os, obj, name, tx));
 }
 
 int
 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 	return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
 int
 zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	return (zap_add(os, obj, name, 8, 1, &value, tx));
 }
 
 int
 zap_update_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	return (zap_update(os, obj, name, 8, 1, &value, tx));
 }
 
 int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	return (zap_lookup(os, obj, name, 8, 1, valuep));
 }
 
 int
 zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
     dmu_tx_t *tx)
 {
 	uint64_t value = 0;
 	int err;
 
 	if (delta == 0)
 		return (0);
 
 	err = zap_lookup(os, obj, name, 8, 1, &value);
 	if (err != 0 && err != ENOENT)
 		return (err);
 	value += delta;
 	if (value == 0)
 		err = zap_remove(os, obj, name, tx);
 	else
 		err = zap_update(os, obj, name, 8, 1, &value, tx);
 	return (err);
 }
 
 int
 zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
     dmu_tx_t *tx)
 {
 	char name[20];
 
 	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	return (zap_increment(os, obj, name, delta, tx));
 }
 
 /*
  * Routines for iterating over the attributes.
  */
 
 int
 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err = ENOENT;
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
 
 	if (zc->zc_leaf &&
 	    (ZAP_HASH_IDX(zc->zc_hash,
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
 	}
 
 again:
 	if (zc->zc_leaf == NULL) {
 		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
 		    &zc->zc_leaf);
 		if (err != 0)
 			return (err);
 	} else {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 	}
 	l = zc->zc_leaf;
 
 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
 
 	if (err == ENOENT) {
 		uint64_t nocare =
-		    (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+		    (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
 		zc->zc_cd = 0;
-		if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+		    zc->zc_hash == 0) {
 			zc->zc_hash = -1ULL;
 		} else {
 			zap_put_leaf(zc->zc_leaf);
 			zc->zc_leaf = NULL;
 			goto again;
 		}
 	}
 
 	if (err == 0) {
 		zc->zc_hash = zeh.zeh_hash;
 		zc->zc_cd = zeh.zeh_cd;
 		za->za_integer_length = zeh.zeh_integer_size;
 		za->za_num_integers = zeh.zeh_num_integers;
 		if (zeh.zeh_num_integers == 0) {
 			za->za_first_integer = 0;
 		} else {
 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
 			ASSERT(err == 0 || err == EOVERFLOW);
 		}
 		err = zap_entry_read_name(zap, &zeh,
 		    sizeof (za->za_name), za->za_name);
 		ASSERT(err == 0);
 
 		za->za_normalization_conflict =
 		    zap_entry_normalization_conflict(&zeh,
 		    NULL, za->za_name, zap);
 	}
 	rw_exit(&zc->zc_leaf->l_rwlock);
 	return (err);
 }
 
 static void
 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 {
 	int i, err;
 	uint64_t lastblk = 0;
 
 	/*
 	 * NB: if a leaf has more pointers than an entire ptrtbl block
 	 * can hold, then it'll be accounted for more than once, since
 	 * we won't have lastblk.
 	 */
 	for (i = 0; i < len; i++) {
 		zap_leaf_t *l;
 
 		if (tbl[i] == lastblk)
 			continue;
 		lastblk = tbl[i];
 
 		err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
 		if (err == 0) {
 			zap_leaf_stats(zap, l, zs);
 			zap_put_leaf(l);
 		}
 	}
 }
 
 int
 fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
 {
 	int err;
 	zap_leaf_t *l;
 	zap_entry_handle_t zeh;
 
 	if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
 		return (err);
 
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err != 0)
 		return (err);
 
 	zc->zc_leaf = l;
 	zc->zc_hash = zeh.zeh_hash;
 	zc->zc_cd = zeh.zeh_cd;
 
 	return (err);
 }
 
 void
 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 {
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	zs->zs_blocksize = 1ULL << bs;
 
 	/*
 	 * Set zap_phys_t fields
 	 */
-	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
-	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
-	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
-	zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
-	zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
-	zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+	zs->zs_magic = zap_f_phys(zap)->zap_magic;
+	zs->zs_salt = zap_f_phys(zap)->zap_salt;
 
 	/*
 	 * Set zap_ptrtbl fields
 	 */
-	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
 	zs->zs_ptrtbl_blks_copied =
-	    zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
-	zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
-	zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
-	zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/* the ptrtbl is entirely in the header block. */
 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
 	} else {
 		int b;
 
 		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
 
-		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 		    b++) {
 			dmu_buf_t *db;
 			int err;
 
 			err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
 			    FTAG, &db, DMU_READ_NO_PREFETCH);
 			if (err == 0) {
 				zap_stats_ptrtbl(zap, db->db_data,
 				    1<<(bs-3), zs);
 				dmu_buf_rele(db, FTAG);
 			}
 		}
 	}
 }
 
 int
 fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
     uint64_t *tooverwrite)
 {
 	zap_t *zap = zn->zn_zap;
 	zap_leaf_t *l;
 	int err;
 
 	/*
 	 * Account for the header block of the fatzap.
 	 */
 	if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
 		*tooverwrite += zap->zap_dbuf->db_size;
 	} else {
 		*towrite += zap->zap_dbuf->db_size;
 	}
 
 	/*
 	 * Account for the pointer table blocks.
 	 * If we are adding we need to account for the following cases :
 	 * - If the pointer table is embedded, this operation could force an
 	 *   external pointer table.
 	 * - If this already has an external pointer table this operation
 	 *   could extend the table.
 	 */
 	if (add) {
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0)
 			*towrite += zap->zap_dbuf->db_size;
 		else
 			*towrite += (zap->zap_dbuf->db_size * 3);
 	}
 
 	/*
 	 * Now, check if the block containing leaf is freeable
 	 * and account accordingly.
 	 */
 	err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0) {
 		return (err);
 	}
 
 	if (!add && dmu_buf_freeable(l->l_dbuf)) {
 		*tooverwrite += l->l_dbuf->db_size;
 	} else {
 		/*
 		 * If this an add operation, the leaf block could split.
 		 * Hence, we need to account for an additional leaf block.
 		 */
 		*towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
 	}
 
 	zap_put_leaf(l);
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	(revision 275782)
@@ -1,873 +1,884 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 /*
  * The 512-byte leaf is broken into 32 16-byte chunks.
  * chunk number n means l_chunk[n], even though the header precedes it.
  * the names are stored null-terminated.
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/arc.h>
 
 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
 #define	CHAIN_END 0xffff /* end of the chunk chain */
 
 /* half the (current) minimum block size */
 #define	MAX_ARRAY_BYTES (8<<10)
 
 #define	LEAF_HASH(l, h) \
 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
-	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+	((h) >> \
+	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
 
-#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+#define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
 
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
 
 static void
 zap_memset(void *a, int c, size_t n)
 {
 	char *cp = a;
 	char *cpend = cp + n;
 
 	while (cp < cpend)
 		*cp++ = c;
 }
 
 static void
 stv(int len, void *addr, uint64_t value)
 {
 	switch (len) {
 	case 1:
 		*(uint8_t *)addr = value;
 		return;
 	case 2:
 		*(uint16_t *)addr = value;
 		return;
 	case 4:
 		*(uint32_t *)addr = value;
 		return;
 	case 8:
 		*(uint64_t *)addr = value;
 		return;
 	}
 	ASSERT(!"bad int len");
 }
 
 static uint64_t
 ldv(int len, const void *addr)
 {
 	switch (len) {
 	case 1:
 		return (*(uint8_t *)addr);
 	case 2:
 		return (*(uint16_t *)addr);
 	case 4:
 		return (*(uint32_t *)addr);
 	case 8:
 		return (*(uint64_t *)addr);
 	}
 	ASSERT(!"bad int len");
 	return (0xFEEDFACEDEADBEEFULL);
 }
 
 void
 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
 {
 	int i;
 	zap_leaf_t l;
+	dmu_buf_t l_dbuf;
+
+	l_dbuf.db_data = buf;
 	l.l_bs = highbit64(size) - 1;
-	l.l_phys = buf;
+	l.l_dbuf = &l_dbuf;
 
 	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
 	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
 	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
 	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
 	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
 	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
 	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
 
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
 		struct zap_leaf_entry *le;
 
 		switch (lc->l_free.lf_type) {
 		case ZAP_CHUNK_ENTRY:
 			le = &lc->l_entry;
 
 			le->le_type =		BSWAP_8(le->le_type);
 			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
 			le->le_next =		BSWAP_16(le->le_next);
 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
 			le->le_name_numints =	BSWAP_16(le->le_name_numints);
 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
 			le->le_value_numints =	BSWAP_16(le->le_value_numints);
 			le->le_cd =		BSWAP_32(le->le_cd);
 			le->le_hash =		BSWAP_64(le->le_hash);
 			break;
 		case ZAP_CHUNK_FREE:
 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
 			break;
 		case ZAP_CHUNK_ARRAY:
 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
 			/* la_array doesn't need swapping */
 			break;
 		default:
 			ASSERT(!"bad leaf type");
 		}
 	}
 }
 
 void
 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
 {
 	int i;
 
 	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
-	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+	    sizeof (struct zap_leaf_header));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
 	}
 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
-	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
-	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
-	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 }
 
 /*
  * Routines which manipulate leaf chunks (l_chunk[]).
  */
 
 static uint16_t
 zap_leaf_chunk_alloc(zap_leaf_t *l)
 {
 	int chunk;
 
-	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
 
-	chunk = l->l_phys->l_hdr.lh_freelist;
+	chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
-	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+	zap_leaf_phys(l)->l_hdr.lh_freelist =
+	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
 
-	l->l_phys->l_hdr.lh_nfree--;
+	zap_leaf_phys(l)->l_hdr.lh_nfree--;
 
 	return (chunk);
 }
 
 static void
 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 {
 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
-	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
 
 	zlf->lf_type = ZAP_CHUNK_FREE;
-	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
-	l->l_phys->l_hdr.lh_freelist = chunk;
+	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
 
-	l->l_phys->l_hdr.lh_nfree++;
+	zap_leaf_phys(l)->l_hdr.lh_nfree++;
 }
 
 /*
  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
  */
 
 static uint16_t
 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
     int integer_size, int num_integers)
 {
 	uint16_t chunk_head;
 	uint16_t *chunkp = &chunk_head;
 	int byten = 0;
 	uint64_t value = 0;
 	int shift = (integer_size-1)*8;
 	int len = num_integers;
 
 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
 
 	while (len > 0) {
 		uint16_t chunk = zap_leaf_chunk_alloc(l);
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int i;
 
 		la->la_type = ZAP_CHUNK_ARRAY;
 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
 			if (byten == 0)
 				value = ldv(integer_size, buf);
 			la->la_array[i] = value >> shift;
 			value <<= 8;
 			if (++byten == integer_size) {
 				byten = 0;
 				buf += integer_size;
 				if (--len == 0)
 					break;
 			}
 		}
 
 		*chunkp = chunk;
 		chunkp = &la->la_next;
 	}
 	*chunkp = CHAIN_END;
 
 	return (chunk_head);
 }
 
 static void
 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
 {
 	uint16_t chunk = *chunkp;
 
 	*chunkp = CHAIN_END;
 
 	while (chunk != CHAIN_END) {
 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
 		    ZAP_CHUNK_ARRAY);
 		zap_leaf_chunk_free(l, chunk);
 		chunk = nextchunk;
 	}
 }
 
 /* array_len and buf_len are in integers, not bytes */
 static void
 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
     void *buf)
 {
 	int len = MIN(array_len, buf_len);
 	int byten = 0;
 	uint64_t value = 0;
 	char *p = buf;
 
 	ASSERT3U(array_int_len, <=, buf_int_len);
 
 	/* Fast path for one 8-byte integer */
 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		uint8_t *ip = la->la_array;
 		uint64_t *buf64 = buf;
 
 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
 		return;
 	}
 
 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
 	if (array_int_len == 1 && buf_int_len == 1 &&
 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_array *la =
 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
 			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
 			p += ZAP_LEAF_ARRAY_BYTES;
 			chunk = la->la_next;
 		}
 		return;
 	}
 
 	while (len > 0) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int i;
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
 			value = (value << 8) | la->la_array[i];
 			byten++;
 			if (byten == array_int_len) {
 				stv(buf_int_len, p, value);
 				byten = 0;
 				len--;
 				if (len == 0)
 					return;
 				p += buf_int_len;
 			}
 		}
 		chunk = la->la_next;
 	}
 }
 
 static boolean_t
 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
     int chunk, int array_numints)
 {
 	int bseen = 0;
 
 	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
 		uint64_t *thiskey;
 		boolean_t match;
 
 		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
 		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
 		    KM_SLEEP);
 
 		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
 		    sizeof (*thiskey), array_numints, thiskey);
 		match = bcmp(thiskey, zn->zn_key_orig,
 		    array_numints * sizeof (*thiskey)) == 0;
 		kmem_free(thiskey, array_numints * sizeof (*thiskey));
 		return (match);
 	}
 
 	ASSERT(zn->zn_key_intlen == 1);
 	if (zn->zn_matchtype == MT_FIRST) {
 		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
 		boolean_t match;
 
 		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
 		    sizeof (char), array_numints, thisname);
 		match = zap_match(zn, thisname);
 		kmem_free(thisname, array_numints);
 		return (match);
 	}
 
 	/*
 	 * Fast path for exact matching.
 	 * First check that the lengths match, so that we don't read
 	 * past the end of the zn_key_orig array.
 	 */
 	if (array_numints != zn->zn_key_orig_numints)
 		return (B_FALSE);
 	while (bseen < array_numints) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
 			break;
 		chunk = la->la_next;
 		bseen += toread;
 	}
 	return (bseen == array_numints);
 }
 
 /*
  * Routines which manipulate leaf entries.
  */
 
 int
 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
 {
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 again:
 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
 		uint16_t chunk = *chunkp;
 		le = ZAP_LEAF_ENTRY(l, chunk);
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 		if (le->le_hash != zn->zn_hash)
 			continue;
 
 		/*
 		 * NB: the entry chain is always sorted by cd on
 		 * normalized zap objects, so this will find the
 		 * lowest-cd match for MT_FIRST.
 		 */
 		ASSERT(zn->zn_matchtype == MT_EXACT ||
-		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
 		    le->le_name_numints)) {
 			zeh->zeh_num_integers = le->le_value_numints;
 			zeh->zeh_integer_size = le->le_value_intlen;
 			zeh->zeh_cd = le->le_cd;
 			zeh->zeh_hash = le->le_hash;
 			zeh->zeh_chunkp = chunkp;
 			zeh->zeh_leaf = l;
 			return (0);
 		}
 	}
 
 	/*
 	 * NB: we could of course do this in one pass, but that would be
 	 * a pain.  We'll see if MT_BEST is even used much.
 	 */
 	if (zn->zn_matchtype == MT_BEST) {
 		zn->zn_matchtype = MT_FIRST;
 		goto again;
 	}
 
 	return (SET_ERROR(ENOENT));
 }
 
 /* Return (h1,cd1 >= h2,cd2) */
 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
 
 int
 zap_leaf_lookup_closest(zap_leaf_t *l,
     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
 {
 	uint16_t chunk;
 	uint64_t besth = -1ULL;
 	uint32_t bestcd = -1U;
 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
 	uint16_t lh;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
-		for (chunk = l->l_phys->l_hash[lh];
+		for (chunk = zap_leaf_phys(l)->l_hash[lh];
 		    chunk != CHAIN_END; chunk = le->le_next) {
 			le = ZAP_LEAF_ENTRY(l, chunk);
 
 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
 				ASSERT3U(bestlh, >=, lh);
 				bestlh = lh;
 				besth = le->le_hash;
 				bestcd = le->le_cd;
 
 				zeh->zeh_num_integers = le->le_value_numints;
 				zeh->zeh_integer_size = le->le_value_intlen;
 				zeh->zeh_cd = le->le_cd;
 				zeh->zeh_hash = le->le_hash;
 				zeh->zeh_fakechunk = chunk;
 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
 				zeh->zeh_leaf = l;
 			}
 		}
 	}
 
-	return (bestcd == -1U ? ENOENT : 0);
+	return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
 }
 
 int
 zap_entry_read(const zap_entry_handle_t *zeh,
     uint8_t integer_size, uint64_t num_integers, void *buf)
 {
 	struct zap_leaf_entry *le =
 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	if (le->le_value_intlen > integer_size)
 		return (SET_ERROR(EINVAL));
 
 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
 	    le->le_value_intlen, le->le_value_numints,
 	    integer_size, num_integers, buf);
 
 	if (zeh->zeh_num_integers > num_integers)
 		return (SET_ERROR(EOVERFLOW));
 	return (0);
 
 }
 
 int
 zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
     char *buf)
 {
 	struct zap_leaf_entry *le =
 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
 		    le->le_name_numints, 8, buflen / 8, buf);
 	} else {
 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
 		    le->le_name_numints, 1, buflen, buf);
 	}
 	if (le->le_name_numints > buflen)
 		return (SET_ERROR(EOVERFLOW));
 	return (0);
 }
 
 int
 zap_entry_update(zap_entry_handle_t *zeh,
 	uint8_t integer_size, uint64_t num_integers, const void *buf)
 {
 	int delta_chunks;
 	zap_leaf_t *l = zeh->zeh_leaf;
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
 
 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
-	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
 		return (SET_ERROR(EAGAIN));
 
 	zap_leaf_array_free(l, &le->le_value_chunk);
 	le->le_value_chunk =
 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
 	le->le_value_numints = num_integers;
 	le->le_value_intlen = integer_size;
 	return (0);
 }
 
 void
 zap_entry_remove(zap_entry_handle_t *zeh)
 {
 	uint16_t entry_chunk;
 	struct zap_leaf_entry *le;
 	zap_leaf_t *l = zeh->zeh_leaf;
 
 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
 
 	entry_chunk = *zeh->zeh_chunkp;
 	le = ZAP_LEAF_ENTRY(l, entry_chunk);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	zap_leaf_array_free(l, &le->le_name_chunk);
 	zap_leaf_array_free(l, &le->le_value_chunk);
 
 	*zeh->zeh_chunkp = le->le_next;
 	zap_leaf_chunk_free(l, entry_chunk);
 
-	l->l_phys->l_hdr.lh_nentries--;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
 }
 
 int
 zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh)
 {
 	uint16_t chunk;
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
 	uint64_t valuelen;
 	int numchunks;
 	uint64_t h = zn->zn_hash;
 
 	valuelen = integer_size * num_integers;
 
 	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
-		return (E2BIG);
+		return (SET_ERROR(E2BIG));
 
 	if (cd == ZAP_NEED_CD) {
 		/* find the lowest unused cd */
-		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
 			cd = 0;
 
 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
 			    chunk != CHAIN_END; chunk = le->le_next) {
 				le = ZAP_LEAF_ENTRY(l, chunk);
 				if (le->le_cd > cd)
 					break;
 				if (le->le_hash == h) {
 					ASSERT3U(cd, ==, le->le_cd);
 					cd++;
 				}
 			}
 		} else {
 			/* old unsorted format; do it the O(n^2) way */
 			for (cd = 0; ; cd++) {
 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
 				    chunk != CHAIN_END; chunk = le->le_next) {
 					le = ZAP_LEAF_ENTRY(l, chunk);
 					if (le->le_hash == h &&
 					    le->le_cd == cd) {
 						break;
 					}
 				}
 				/* If this cd is not in use, we are good. */
 				if (chunk == CHAIN_END)
 					break;
 			}
 		}
 		/*
 		 * We would run out of space in a block before we could
 		 * store enough entries to run out of CD values.
 		 */
 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
 	}
 
-	if (l->l_phys->l_hdr.lh_nfree < numchunks)
+	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
 		return (SET_ERROR(EAGAIN));
 
 	/* make the entry */
 	chunk = zap_leaf_chunk_alloc(l);
 	le = ZAP_LEAF_ENTRY(l, chunk);
 	le->le_type = ZAP_CHUNK_ENTRY;
 	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
 	    zn->zn_key_intlen, zn->zn_key_orig_numints);
 	le->le_name_numints = zn->zn_key_orig_numints;
 	le->le_value_chunk =
 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
 	le->le_value_numints = num_integers;
 	le->le_value_intlen = integer_size;
 	le->le_hash = h;
 	le->le_cd = cd;
 
 	/* link it into the hash chain */
 	/* XXX if we did the search above, we could just use that */
 	chunkp = zap_leaf_rehash_entry(l, chunk);
 
-	l->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries++;
 
 	zeh->zeh_leaf = l;
 	zeh->zeh_num_integers = num_integers;
 	zeh->zeh_integer_size = le->le_value_intlen;
 	zeh->zeh_cd = le->le_cd;
 	zeh->zeh_hash = le->le_hash;
 	zeh->zeh_chunkp = chunkp;
 
 	return (0);
 }
 
 /*
  * Determine if there is another entry with the same normalized form.
  * For performance purposes, either zn or name must be provided (the
  * other can be NULL).  Note, there usually won't be any hash
  * conflicts, in which case we don't need the concatenated/normalized
  * form of the name.  But all callers have one of these on hand anyway,
  * so might as well take advantage.  A cleaner but slower interface
  * would accept neither argument, and compute the normalized name as
  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
  */
 boolean_t
 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
     const char *name, zap_t *zap)
 {
 	uint64_t chunk;
 	struct zap_leaf_entry *le;
 	boolean_t allocdzn = B_FALSE;
 
 	if (zap->zap_normflags == 0)
 		return (B_FALSE);
 
 	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
 	    chunk != CHAIN_END; chunk = le->le_next) {
 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
 		if (le->le_hash != zeh->zeh_hash)
 			continue;
 		if (le->le_cd == zeh->zeh_cd)
 			continue;
 
 		if (zn == NULL) {
 			zn = zap_name_alloc(zap, name, MT_FIRST);
 			allocdzn = B_TRUE;
 		}
 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
 		    le->le_name_chunk, le->le_name_numints)) {
 			if (allocdzn)
 				zap_name_free(zn);
 			return (B_TRUE);
 		}
 	}
 	if (allocdzn)
 		zap_name_free(zn);
 	return (B_FALSE);
 }
 
 /*
  * Routines for transferring entries between leafs.
  */
 
 static uint16_t *
 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
 {
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
 	struct zap_leaf_entry *le2;
 	uint16_t *chunkp;
 
 	/*
 	 * keep the entry chain sorted by cd
 	 * NB: this will not cause problems for unsorted leafs, though
 	 * it is unnecessary there.
 	 */
 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
 		if (le2->le_cd > le->le_cd)
 			break;
 	}
 
 	le->le_next = *chunkp;
 	*chunkp = entry;
 	return (chunkp);
 }
 
 static uint16_t
 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
 {
 	uint16_t new_chunk;
 	uint16_t *nchunkp = &new_chunk;
 
 	while (chunk != CHAIN_END) {
 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
 		struct zap_leaf_array *nla =
 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
 		struct zap_leaf_array *la =
 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int nextchunk = la->la_next;
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(nl));
 
 		*nla = *la; /* structure assignment */
 
 		zap_leaf_chunk_free(l, chunk);
 		chunk = nextchunk;
 		*nchunkp = nchunk;
 		nchunkp = &nla->la_next;
 	}
 	*nchunkp = CHAIN_END;
 	return (new_chunk);
 }
 
 static void
 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
 {
 	struct zap_leaf_entry *le, *nle;
 	uint16_t chunk;
 
 	le = ZAP_LEAF_ENTRY(l, entry);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	chunk = zap_leaf_chunk_alloc(nl);
 	nle = ZAP_LEAF_ENTRY(nl, chunk);
 	*nle = *le; /* structure assignment */
 
 	(void) zap_leaf_rehash_entry(nl, chunk);
 
 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
 	nle->le_value_chunk =
 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
 
 	zap_leaf_chunk_free(l, entry);
 
-	l->l_phys->l_hdr.lh_nentries--;
-	nl->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
+	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
 }
 
 /*
  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
  */
 void
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
 	int i;
-	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* set new prefix and prefix_len */
-	l->l_phys->l_hdr.lh_prefix <<= 1;
-	l->l_phys->l_hdr.lh_prefix_len++;
-	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
-	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* break existing hash chains */
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 
 	/*
 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
 	 * the remaining entries
 	 *
 	 * NB: We could find entries via the hashtable instead. That
 	 * would be O(hashents+numents) rather than O(numblks+numents),
 	 * but this accesses memory more sequentially, and when we're
 	 * called, the block is usually pretty full.
 	 */
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
 		if (le->le_type != ZAP_CHUNK_ENTRY)
 			continue;
 
 		if (le->le_hash & (1ULL << bit))
 			zap_leaf_transfer_entry(l, i, nl);
 		else
 			(void) zap_leaf_rehash_entry(l, i);
 	}
 }
 
 void
 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
 {
 	int i, n;
 
-	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
-	    l->l_phys->l_hdr.lh_prefix_len;
+	n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_leafs_with_2n_pointers[n]++;
 
 
-	n = l->l_phys->l_hdr.lh_nentries/5;
+	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_with_n5_entries[n]++;
 
 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
-	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
 	    (1<<FZAP_BLOCK_SHIFT(zap));
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_n_tenths_full[n]++;
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
 		int nentries = 0;
-		int chunk = l->l_phys->l_hash[i];
+		int chunk = zap_leaf_phys(l)->l_hash[i];
 
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_entry *le =
 			    ZAP_LEAF_ENTRY(l, chunk);
 
 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
 			    le->le_value_intlen);
 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 			zs->zs_entries_using_n_chunks[n]++;
 
 			chunk = le->le_next;
 			nentries++;
 		}
 
 		n = nentries;
 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 		zs->zs_buckets_with_n_entries[n]++;
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	(revision 275782)
@@ -1,1466 +1,1468 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/zap.h>
 #include <sys/refcount.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
 #include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
 
 uint64_t
 zap_getflags(zap_t *zap)
 {
 	if (zap->zap_ismicro)
 		return (0);
-	return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+	return (zap_f_phys(zap)->zap_flags);
 }
 
 int
 zap_hashbits(zap_t *zap)
 {
 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
 		return (48);
 	else
 		return (28);
 }
 
 uint32_t
 zap_maxcd(zap_t *zap)
 {
 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
 		return ((1<<16)-1);
 	else
 		return (-1U);
 }
 
 static uint64_t
 zap_hash(zap_name_t *zn)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t h = 0;
 
 	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
 		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
 		h = *(uint64_t *)zn->zn_key_orig;
 	} else {
 		h = zap->zap_salt;
 		ASSERT(h != 0);
 		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
 			int i;
 			const uint64_t *wp = zn->zn_key_norm;
 
 			ASSERT(zn->zn_key_intlen == 8);
 			for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
 				int j;
 				uint64_t word = *wp;
 
 				for (j = 0; j < zn->zn_key_intlen; j++) {
 					h = (h >> 8) ^
 					    zfs_crc64_table[(h ^ word) & 0xFF];
 					word >>= NBBY;
 				}
 			}
 		} else {
 			int i, len;
 			const uint8_t *cp = zn->zn_key_norm;
 
 			/*
 			 * We previously stored the terminating null on
 			 * disk, but didn't hash it, so we need to
 			 * continue to not hash it.  (The
 			 * zn_key_*_numints includes the terminating
 			 * null for non-binary keys.)
 			 */
 			len = zn->zn_key_norm_numints - 1;
 
 			ASSERT(zn->zn_key_intlen == 1);
 			for (i = 0; i < len; cp++, i++) {
 				h = (h >> 8) ^
 				    zfs_crc64_table[(h ^ *cp) & 0xFF];
 			}
 		}
 	}
 	/*
 	 * Don't use all 64 bits, since we need some in the cookie for
 	 * the collision differentiator.  We MUST use the high bits,
 	 * since those are the ones that we first pay attention to when
-	 * chosing the bucket.
+	 * choosing the bucket.
 	 */
 	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 
 	return (h);
 }
 
 static int
 zap_normalize(zap_t *zap, const char *name, char *namenorm)
 {
 	size_t inlen, outlen;
 	int err;
 
 	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
 
 	inlen = strlen(name) + 1;
 	outlen = ZAP_MAXNAMELEN;
 
 	err = 0;
 	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
 	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
 	    U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
 
 	return (err);
 }
 
 boolean_t
 zap_match(zap_name_t *zn, const char *matchname)
 {
 	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
 
 	if (zn->zn_matchtype == MT_FIRST) {
 		char norm[ZAP_MAXNAMELEN];
 
 		if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
 			return (B_FALSE);
 
 		return (strcmp(zn->zn_key_norm, norm) == 0);
 	} else {
 		/* MT_BEST or MT_EXACT */
 		return (strcmp(zn->zn_key_orig, matchname) == 0);
 	}
 }
 
 void
 zap_name_free(zap_name_t *zn)
 {
 	kmem_free(zn, sizeof (zap_name_t));
 }
 
 zap_name_t *
 zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
 {
 	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 
 	zn->zn_zap = zap;
 	zn->zn_key_intlen = sizeof (*key);
 	zn->zn_key_orig = key;
 	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
 	zn->zn_matchtype = mt;
 	if (zap->zap_normflags) {
 		if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
 			zap_name_free(zn);
 			return (NULL);
 		}
 		zn->zn_key_norm = zn->zn_normbuf;
 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
 	} else {
 		if (mt != MT_EXACT) {
 			zap_name_free(zn);
 			return (NULL);
 		}
 		zn->zn_key_norm = zn->zn_key_orig;
 		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
 	}
 
 	zn->zn_hash = zap_hash(zn);
 	return (zn);
 }
 
 zap_name_t *
 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
 {
 	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 
 	ASSERT(zap->zap_normflags == 0);
 	zn->zn_zap = zap;
 	zn->zn_key_intlen = sizeof (*key);
 	zn->zn_key_orig = zn->zn_key_norm = key;
 	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
 	zn->zn_matchtype = MT_EXACT;
 
 	zn->zn_hash = zap_hash(zn);
 	return (zn);
 }
 
 static void
 mzap_byteswap(mzap_phys_t *buf, size_t size)
 {
 	int i, max;
 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
 	buf->mz_salt = BSWAP_64(buf->mz_salt);
 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
 	max = (size / MZAP_ENT_LEN) - 1;
 	for (i = 0; i < max; i++) {
 		buf->mz_chunk[i].mze_value =
 		    BSWAP_64(buf->mz_chunk[i].mze_value);
 		buf->mz_chunk[i].mze_cd =
 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
 	}
 }
 
 void
 zap_byteswap(void *buf, size_t size)
 {
 	uint64_t block_type;
 
 	block_type = *(uint64_t *)buf;
 
 	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
 		mzap_byteswap(buf, size);
 	} else {
 		fzap_byteswap(buf, size);
 	}
 }
 
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
 	const mzap_ent_t *mze1 = arg1;
 	const mzap_ent_t *mze2 = arg2;
 
 	if (mze1->mze_hash > mze2->mze_hash)
 		return (+1);
 	if (mze1->mze_hash < mze2->mze_hash)
 		return (-1);
 	if (mze1->mze_cd > mze2->mze_cd)
 		return (+1);
 	if (mze1->mze_cd < mze2->mze_cd)
 		return (-1);
 	return (0);
 }
 
 static int
 mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 {
 	mzap_ent_t *mze;
 	avl_index_t idx;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
 	mze->mze_chunkid = chunkid;
 	mze->mze_hash = hash;
 	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
 	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
 	if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
 		kmem_free(mze, sizeof (mzap_ent_t));
 		return (EEXIST);
 	}
 	avl_insert(&zap->zap_m.zap_avl, mze, idx);
 	return (0);
 }
 
 static mzap_ent_t *
 mze_find(zap_name_t *zn)
 {
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 	avl_index_t idx;
 	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
 
 	ASSERT(zn->zn_zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
 	mze_tofind.mze_hash = zn->zn_hash;
 	mze_tofind.mze_cd = 0;
 
 again:
 	mze = avl_find(avl, &mze_tofind, &idx);
 	if (mze == NULL)
 		mze = avl_nearest(avl, idx, AVL_AFTER);
 	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
 		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
 		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 			return (mze);
 	}
 	if (zn->zn_matchtype == MT_BEST) {
 		zn->zn_matchtype = MT_FIRST;
 		goto again;
 	}
 	return (NULL);
 }
 
 static uint32_t
 mze_find_unused_cd(zap_t *zap, uint64_t hash)
 {
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 	avl_index_t idx;
 	avl_tree_t *avl = &zap->zap_m.zap_avl;
 	uint32_t cd;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	mze_tofind.mze_hash = hash;
 	mze_tofind.mze_cd = 0;
 
 	cd = 0;
 	for (mze = avl_find(avl, &mze_tofind, &idx);
 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
 		if (mze->mze_cd != cd)
 			break;
 		cd++;
 	}
 
 	return (cd);
 }
 
 static void
 mze_remove(zap_t *zap, mzap_ent_t *mze)
 {
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	avl_remove(&zap->zap_m.zap_avl, mze);
 	kmem_free(mze, sizeof (mzap_ent_t));
 }
 
 static void
 mze_destroy(zap_t *zap)
 {
 	mzap_ent_t *mze;
 	void *avlcookie = NULL;
 
 	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
 		kmem_free(mze, sizeof (mzap_ent_t));
 	avl_destroy(&zap->zap_m.zap_avl);
 }
 
 static zap_t *
 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 {
 	zap_t *winner;
 	zap_t *zap;
 	int i;
 
 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
 
 	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 	rw_init(&zap->zap_rwlock, 0, 0, 0);
 	rw_enter(&zap->zap_rwlock, RW_WRITER);
 	zap->zap_objset = os;
 	zap->zap_object = obj;
 	zap->zap_dbuf = db;
 
 	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
 		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
 	} else {
 		zap->zap_ismicro = TRUE;
 	}
 
 	/*
 	 * Make sure that zap_ismicro is set before we let others see
 	 * it, because zap_lockdir() checks zap_ismicro without the lock
 	 * held.
 	 */
-	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+	winner = dmu_buf_set_user(db, zap, zap_evict);
 
 	if (winner != NULL) {
 		rw_exit(&zap->zap_rwlock);
 		rw_destroy(&zap->zap_rwlock);
 		if (!zap->zap_ismicro)
 			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 		kmem_free(zap, sizeof (zap_t));
 		return (winner);
 	}
 
 	if (zap->zap_ismicro) {
-		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
-		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+		zap->zap_salt = zap_m_phys(zap)->mz_salt;
+		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 		avl_create(&zap->zap_m.zap_avl, mze_compare,
 		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 
 		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 			mzap_ent_phys_t *mze =
-			    &zap->zap_m.zap_phys->mz_chunk[i];
+			    &zap_m_phys(zap)->mz_chunk[i];
 			if (mze->mze_name[0]) {
 				zap_name_t *zn;
 
 				zn = zap_name_alloc(zap, mze->mze_name,
 				    MT_EXACT);
 				if (mze_insert(zap, i, zn->zn_hash) == 0)
 					zap->zap_m.zap_num_entries++;
 				else {
 					printf("ZFS WARNING: Duplicated ZAP "
 					    "entry detected (%s).\n",
 					    mze->mze_name);
 				}
 				zap_name_free(zn);
 			}
 		}
 	} else {
-		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+		zap->zap_salt = zap_f_phys(zap)->zap_salt;
+		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
 		    2*ZAP_LEAF_CHUNKSIZE);
 
 		/*
 		 * The embedded pointer table should not overlap the
 		 * other members.
 		 */
 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
-		    &zap->zap_f.zap_phys->zap_salt);
+		    &zap_f_phys(zap)->zap_salt);
 
 		/*
 		 * The embedded pointer table should end at the end of
 		 * the block
 		 */
 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
-		    (uintptr_t)zap->zap_f.zap_phys, ==,
+		    (uintptr_t)zap_f_phys(zap), ==,
 		    zap->zap_dbuf->db_size);
 	}
 	rw_exit(&zap->zap_rwlock);
 	return (zap);
 }
 
 int
 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	zap_t *zap;
 	dmu_buf_t *db;
 	krw_t lt;
 	int err;
 
 	*zapp = NULL;
 
 	err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
 		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
 	zap = dmu_buf_get_user(db);
 	if (zap == NULL)
 		zap = mzap_open(os, obj, db);
 
 	/*
 	 * We're checking zap_ismicro without the lock held, in order to
 	 * tell what type of lock we want.  Once we have some sort of
 	 * lock, see if it really is the right type.  In practice this
 	 * can only be different if it was upgraded from micro to fat,
 	 * and micro wanted WRITER but fat only needs READER.
 	 */
 	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 	rw_enter(&zap->zap_rwlock, lt);
 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 		/* it was upgraded, now we only need reader */
 		ASSERT(lt == RW_WRITER);
 		ASSERT(RW_READER ==
-		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
+		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
 		rw_downgrade(&zap->zap_rwlock);
 		lt = RW_READER;
 	}
 
 	zap->zap_objset = os;
 
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 
 	ASSERT3P(zap->zap_dbuf, ==, db);
 
 	ASSERT(!zap->zap_ismicro ||
 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
 	if (zap->zap_ismicro && tx && adding &&
 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
 		if (newsz > MZAP_MAX_BLKSZ) {
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
 			return (mzap_upgrade(zapp, tx, 0));
 		}
 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
 		ASSERT0(err);
 		zap->zap_m.zap_num_chunks =
 		    db->db_size / MZAP_ENT_LEN - 1;
 	}
 
 	*zapp = zap;
 	return (0);
 }
 
 /*
  * Counterpart of zap_lockdir(): drop zap_rwlock, then release the hold
  * on the ZAP's dbuf.
  */
 void
 zap_unlockdir(zap_t *zap)
 {
 	dmu_buf_t *db = zap->zap_dbuf;
 
 	rw_exit(&zap->zap_rwlock);
 	dmu_buf_rele(db, NULL);
 }
 
 /*
  * Convert the microzap *zapp into a fat zap.  The caller must hold
  * zap_rwlock as writer.
  *
  * The current block contents are copied aside, the object is resized to
  * the default fat zap block size (only when "flags" is 0), the in-core
  * microzap entries are destroyed, fzap_upgrade() is run, and every
  * non-empty saved chunk is re-added with fzap_add_cd().
  *
  * Returns 0 or the first error encountered.  Except when the resize
  * fails, *zapp is updated, because fzap_add_cd() may change the zap.
  */
 static int
 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 {
 	zap_t *zap = *zapp;
 	mzap_phys_t *saved;
 	int bufsize, nchunks, chunk;
 	int err = 0;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/* Keep a private copy of the microzap block to replay from. */
 	bufsize = zap->zap_dbuf->db_size;
 	saved = kmem_alloc(bufsize, KM_SLEEP);
 	bcopy(zap->zap_dbuf->db_data, saved, bufsize);
 	nchunks = zap->zap_m.zap_num_chunks;
 
 	if (flags == 0) {
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err != 0) {
 			kmem_free(saved, bufsize);
 			return (err);
 		}
 	}
 
 	dprintf("upgrading obj=%llu with %u chunks\n",
 	    zap->zap_object, nchunks);
 	/* XXX destroy the avl later, so we can use the stored hash value */
 	mze_destroy(zap);
 
 	fzap_upgrade(zap, tx, flags);
 
 	/* Replay the saved entries; stop at the first failure. */
 	for (chunk = 0; err == 0 && chunk < nchunks; chunk++) {
 		mzap_ent_phys_t *mze = &saved->mz_chunk[chunk];
 		zap_name_t *zn;
 
 		if (mze->mze_name[0] != 0) {
 			dprintf("adding %s=%llu\n",
 			    mze->mze_name, mze->mze_value);
 			zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
 			err = fzap_add_cd(zn, 8, 1, &mze->mze_value,
 			    mze->mze_cd, tx);
 			/* fzap_add_cd() may change zap */
 			zap = zn->zn_zap;
 			zap_name_free(zn);
 		}
 	}
 
 	kmem_free(saved, bufsize);
 	*zapp = zap;
 	return (err);
 }
 
 /*
  * Initialize object "obj" as an empty microzap: dirty its first block
  * and fill in the block type, salt and normalization flags.  When
  * "flags" is non-zero the new microzap is immediately upgraded to a fat
  * zap.  Any failure along the way trips a VERIFY.
  */
 void
 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
     dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	mzap_phys_t *mzp;
 	zap_t *zap;
 
 	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
 		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
 	dmu_buf_will_dirty(db, tx);
 	mzp = db->db_data;
 	mzp->mz_block_type = ZBT_MICRO;
 	/* OR-ing in 1 guarantees the salt is never zero. */
 	mzp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
 	mzp->mz_normflags = normflags;
 	dmu_buf_rele(db, FTAG);
 
 	if (flags == 0)
 		return;
 
 	/* Only fat zap supports flags; upgrade immediately. */
 	VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
 	    B_FALSE, B_FALSE, &zap));
 	VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
 	zap_unlockdir(zap);
 }
 
 /*
  * Claim object number "obj" and set it up as a ZAP without
  * normalization flags; forwards to zap_create_claim_norm().
  */
 int
 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int err;
 
 	err = zap_create_claim_norm(os, obj, 0, ot, bonustype, bonuslen, tx);
 	return (err);
 }
 
 /*
  * Claim object number "obj" with dmu_object_claim() and, if that
  * succeeds, initialize it as a microzap with the given normalization
  * flags.  Returns the dmu_object_claim() error, or 0.
  */
 int
 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
     dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
 
 	if (err == 0)
 		mzap_create_impl(os, obj, normflags, 0, tx);
 	return (err);
 }
 
 /*
  * Allocate a new ZAP object without normalization flags and return its
  * object number; forwards to zap_create_norm().
  */
 uint64_t
 zap_create(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = zap_create_norm(os, 0, ot, bonustype, bonuslen, tx);
 	return (obj);
 }
 
 /*
  * Allocate a new object with dmu_object_alloc(), initialize it as a
  * microzap with the given normalization flags, and return its object
  * number.
  */
 uint64_t
 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 	mzap_create_impl(os, obj, normflags, 0, tx);
 
 	return (obj);
 }
 
 /*
  * Allocate a new ZAP object with explicit leaf and indirect block
  * shifts, then initialize it with the given normalization flags and zap
  * flags.  Both shifts are asserted to lie within
  * [SPA_MINBLOCKSHIFT, SPA_OLD_MAXBLOCKSHIFT].  Returns the new object
  * number.
  */
 uint64_t
 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
 	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
 	    leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
 	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
 	    indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
 	VERIFY(dmu_object_set_blocksize(os, obj,
 	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
 
 	mzap_create_impl(os, obj, normflags, flags, tx);
 
 	return (obj);
 }
 
 /*
  * Destroy ZAP object "zapobj" by freeing it through dmu_object_free().
  */
 int
 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 {
 	int err;
 
 	/*
 	 * dmu_object_free will free the object number and free the
 	 * data.  Freeing the data will cause our pageout function to be
 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
 	 */
 	err = dmu_object_free(os, zapobj, tx);
 
 	return (err);
 }
 
 /*
  * Tear down and free the in-core zap_t passed as "vzap": destroy its
  * rwlock, then either the microzap entries or the fat zap's
  * num_entries mutex, and finally free the structure.  "db" is unused.
  */
 _NOTE(ARGSUSED(0))
 void
 zap_evict(dmu_buf_t *db, void *vzap)
 {
 	zap_t *zap = vzap;
 
 	rw_destroy(&zap->zap_rwlock);
 
 	if (!zap->zap_ismicro) {
 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 	} else {
 		mze_destroy(zap);
 	}
 
 	kmem_free(zap, sizeof (zap_t));
 }
 
 /*
  * Store the number of entries in ZAP object "zapobj" in *count.
  * Returns 0, or an error from zap_lockdir() or fzap_count().
  */
 int
 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 {
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	if (zap->zap_ismicro)
 		*count = zap->zap_m.zap_num_entries;
 	else
 		err = fzap_count(zap, count);
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Report whether some other microzap entry with the same hash as "mze"
  * also matches its name, walking the AVL tree backward and then forward
  * from "mze".  Always B_FALSE when the zap has no normalization flags.
  *
  * zn may be NULL; in that case it is built from mze's own name (with
  * MT_FIRST) only once a same-hash neighbor is found, and freed before
  * returning.
  * See also the comment above zap_entry_normalization_conflict().
  */
 static boolean_t
 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
 {
 	mzap_ent_t *other;
 	boolean_t allocdzn = B_FALSE;
 	boolean_t conflict = B_FALSE;
 	int pass;
 
 	if (zap->zap_normflags == 0)
 		return (B_FALSE);
 
 	/* Pass 0 walks toward smaller entries, pass 1 toward larger ones. */
 	for (pass = 0; pass < 2 && !conflict; pass++) {
 		int direction = (pass == 0) ? AVL_BEFORE : AVL_AFTER;
 
 		other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
 		while (other != NULL && other->mze_hash == mze->mze_hash) {
 			if (zn == NULL) {
 				zn = zap_name_alloc(zap,
 				    MZE_PHYS(zap, mze)->mze_name, MT_FIRST);
 				allocdzn = B_TRUE;
 			}
 			if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
 				conflict = B_TRUE;
 				break;
 			}
 			other = avl_walk(&zap->zap_m.zap_avl, other,
 			    direction);
 		}
 	}
 
 	if (allocdzn)
 		zap_name_free(zn);
 	return (conflict);
 }
 
 /*
  * Routines for manipulating attributes.
  */
 
 /*
  * Exact-match lookup of "name" in ZAP object "zapobj"; forwards to
  * zap_lookup_norm() with MT_EXACT and no real-name or conflict output.
  */
 int
 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	int err;
 
 	err = zap_lookup_norm(os, zapobj, name, integer_size,
 	    num_integers, buf, MT_EXACT, NULL, 0, NULL);
 	return (err);
 }
 
 /*
  * Look up "name" in ZAP object "zapobj" using match type "mt".
  *
  * On success the value is stored in "buf".  For a microzap the value is
  * a single 8-byte integer, so num_integers < 1 yields EOVERFLOW and an
  * integer_size other than 8 yields EINVAL.  If "realname" is non-NULL
  * the stored entry name is copied into it (at most rn_len bytes); if
  * "ncp" is non-NULL it receives the normalization-conflict indication.
  *
  * Returns 0, ENOTSUP if the name cannot be set up for this zap, ENOENT
  * if a microzap has no such entry, or an error from zap_lockdir() or
  * fzap_lookup().
  */
 int
 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
 	zap_t *zap;
 	int err;
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (!zap->zap_ismicro) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
 		    realname, rn_len, ncp);
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			if (num_integers < 1) {
 				err = SET_ERROR(EOVERFLOW);
 			} else if (integer_size != 8) {
 				err = SET_ERROR(EINVAL);
 			} else {
 				*(uint64_t *)buf =
 				    MZE_PHYS(zap, mze)->mze_value;
 				/*
 				 * Callers such as zap_lookup() pass a NULL
 				 * realname; never hand that to strlcpy().
 				 */
 				if (realname != NULL) {
 					(void) strlcpy(realname,
 					    MZE_PHYS(zap, mze)->mze_name,
 					    rn_len);
 				}
 				if (ncp) {
 					*ncp = mzap_normalization_conflict(zap,
 					    zn, mze);
 				}
 			}
 		}
 	}
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Issue a prefetch, via fzap_prefetch(), for the entry identified by
  * the uint64 array key "key" of "key_numints" integers.  Returns an
  * error from zap_lockdir(), ENOTSUP if the key cannot be set up for
  * this zap, or 0.
  */
 int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		fzap_prefetch(zn);
 		zap_name_free(zn);
 	} else {
 		err = SET_ERROR(ENOTSUP);
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Look up the entry identified by the uint64 array key "key" and copy
  * its value into "buf" through fzap_lookup().  Returns an error from
  * zap_lockdir(), ENOTSUP if the key cannot be set up for this zap, or
  * the fzap_lookup() result.
  */
 int
 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
 		    NULL, 0, NULL);
 		zap_name_free(zn);
 	} else {
 		err = SET_ERROR(ENOTSUP);
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Test whether "name" exists in ZAP object "zapobj" without reading its
  * value.  The lookup is made with a zero-sized buffer description, so
  * EOVERFLOW and EINVAL both mean the entry exists and are reported as
  * 0; any other result is returned unchanged.
  */
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
 	int err;
 
 	err = zap_lookup_norm(os, zapobj, name, 0,
 	    0, NULL, MT_EXACT, NULL, 0, NULL);
 
 	switch (err) {
 	case EOVERFLOW:
 	case EINVAL:
 		/* found, but skipped reading the value */
 		return (0);
 	default:
 		return (err);
 	}
 }
 
 /*
  * Report the integer size and integer count of entry "name".  Either
  * output pointer may be NULL on the microzap path, where an existing
  * entry is always one 8-byte integer.  Returns ENOENT if a microzap has
  * no such entry, ENOTSUP if the name cannot be set up for this zap, or
  * an error from zap_lockdir() or fzap_length().
  */
 int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	mzap_ent_t *mze;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		mze = mze_find(zn);
 		if (mze != NULL) {
 			if (integer_size != NULL)
 				*integer_size = 8;
 			if (num_integers != NULL)
 				*num_integers = 1;
 		} else {
 			err = SET_ERROR(ENOENT);
 		}
 	} else {
 		err = fzap_length(zn, integer_size, num_integers);
 	}
 
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Report the integer size and integer count of the entry identified by
  * the uint64 array key "key", through fzap_length().  Returns an error
  * from zap_lockdir(), ENOTSUP if the key cannot be set up for this zap,
  * or the fzap_length() result.
  */
 int
 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		err = fzap_length(zn, integer_size, num_integers);
 		zap_name_free(zn);
 	} else {
 		err = SET_ERROR(ENOTSUP);
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Store a new entry (zn's original key, "value") in the microzap
  * zn->zn_zap.  The caller must hold zap_rwlock as writer.
  *
  * The first chunk with an empty name is used, searching from
  * zap_alloc_next to the end and then once more from chunk 0.  The
  * entry is also inserted into the in-core AVL tree via mze_insert().
  * Debug builds assert that the key is not already present.
  */
 static void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
 	int i;
 	zap_t *zap = zn->zn_zap;
 	int start = zap->zap_m.zap_alloc_next;
 	uint32_t cd;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 #ifdef ZFS_DEBUG
 	/* The key being added must not already exist in any chunk. */
 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
 
 	cd = mze_find_unused_cd(zap, zn->zn_hash);
 	/* given the limited size of the microzap, this can't happen */
 	ASSERT(cd < zap_maxcd(zap));
 
 again:
 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		if (mze->mze_name[0] == 0) {
 			/* Free chunk: fill it in and index it. */
 			mze->mze_value = value;
 			mze->mze_cd = cd;
 			(void) strcpy(mze->mze_name, zn->zn_key_orig);
 			zap->zap_m.zap_num_entries++;
 			/* Next search starts after this chunk, wrapping. */
 			zap->zap_m.zap_alloc_next = i+1;
 			if (zap->zap_m.zap_alloc_next ==
 			    zap->zap_m.zap_num_chunks)
 				zap->zap_m.zap_alloc_next = 0;
 			VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
 			return;
 		}
 	}
 	/* Nothing free from "start" onward; retry once from chunk 0. */
 	if (start != 0) {
 		start = 0;
 		goto again;
 	}
 	ASSERT(!"out of entries!");
 }
 
 /*
  * Add entry "key" with value "val" to ZAP object "zapobj".
  *
  * A fat zap takes the entry through fzap_add().  A microzap stores it
  * directly when it is a single 8-byte integer and the key is shorter
  * than MZAP_NAME_LEN (EEXIST if the key is already present); otherwise
  * the microzap is upgraded first and the entry is added with
  * fzap_add().
  */
 int
 zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	mzap_ent_t *mze;
 	const uint64_t *intval = val;
 	int err;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc(zap, key, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		if (integer_size == 8 && num_integers == 1 &&
 		    strlen(key) < MZAP_NAME_LEN) {
 			/* Fits in a microzap chunk. */
 			mze = mze_find(zn);
 			if (mze == NULL) {
 				mzap_addent(zn, *intval);
 			} else {
 				err = SET_ERROR(EEXIST);
 			}
 		} else {
 			/* Too big for a microzap: upgrade, then add. */
 			err = mzap_upgrade(&zn->zn_zap, tx, 0);
 			if (err == 0) {
 				err = fzap_add(zn, integer_size, num_integers,
 				    val, tx);
 			}
 			zap = zn->zn_zap;	/* fzap_add() may change zap */
 		}
 	} else {
 		err = fzap_add(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	}
 
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Add an entry identified by the uint64 array key "key" through
  * fzap_add().  Returns an error from zap_lockdir(), ENOTSUP if the key
  * cannot be set up for this zap, or the fzap_add() result.
  */
 int
 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	err = fzap_add(zn, integer_size, num_integers, val, tx);
 	/* fzap_add() may change zap */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* may be NULL if fzap_add() failed */
 	if (zap != NULL)
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Set entry "name" in ZAP object "zapobj" to "val", creating the entry
  * if it does not exist.
  *
  * A fat zap is updated through fzap_update().  A microzap is updated in
  * place (or gains a new chunk) when the value is a single 8-byte
  * integer and the name is shorter than MZAP_NAME_LEN; otherwise it is
  * upgraded first and then updated with fzap_update().
  */
 int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	mzap_ent_t *mze;
 	const uint64_t *intval = val;
 	uint64_t oldval;
 	int err;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * If there is an old value, it shouldn't change across the
 	 * lockdir (eg, due to bprewrite's xlation).
 	 */
 	if (integer_size == 8 && num_integers == 1)
 		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
 #endif
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		if (integer_size == 8 && num_integers == 1 &&
 		    strlen(name) < MZAP_NAME_LEN) {
 			/* Fits in a microzap chunk. */
 			mze = mze_find(zn);
 			if (mze == NULL) {
 				mzap_addent(zn, *intval);
 			} else {
 				ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==,
 				    oldval);
 				MZE_PHYS(zap, mze)->mze_value = *intval;
 			}
 		} else {
 			/* Too big for a microzap: upgrade, then update. */
 			dprintf("upgrading obj %llu: intsz=%u numint=%llu "
 			    "name=%s\n",
 			    zapobj, integer_size, num_integers, name);
 			err = mzap_upgrade(&zn->zn_zap, tx, 0);
 			if (err == 0) {
 				err = fzap_update(zn, integer_size,
 				    num_integers, val, tx);
 			}
 			zap = zn->zn_zap;	/* fzap_update() may change zap */
 		}
 	} else {
 		err = fzap_update(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	}
 
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Set the entry identified by the uint64 array key "key" to "val"
  * through fzap_update().  Returns an error from zap_lockdir(), ENOTSUP
  * if the key cannot be set up for this zap, or the fzap_update()
  * result.
  */
 int
 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	err = fzap_update(zn, integer_size, num_integers, val, tx);
 	/* fzap_update() may change zap */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* may be NULL if fzap_upgrade() failed */
 	if (zap != NULL)
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Remove the entry whose name exactly matches "name"; forwards to
  * zap_remove_norm() with MT_EXACT.
  */
 int
 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 {
 	int err;
 
 	err = zap_remove_norm(os, zapobj, name, MT_EXACT, tx);
 	return (err);
 }
 
 /*
  * Remove entry "name" from ZAP object "zapobj" using match type "mt".
  *
  * A fat zap removes it through fzap_remove().  For a microzap the
  * entry count is decremented, the entry's physical chunk is zeroed and
  * the in-core entry is removed.  Returns ENOENT if a microzap has no
  * such entry, ENOTSUP if the name cannot be set up for this zap, or an
  * error from zap_lockdir() or fzap_remove().
  */
 int
 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int err;
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_remove(zn, tx);
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			zap->zap_m.zap_num_entries--;
 			/* A zeroed chunk has an empty name, i.e. is free. */
-			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
 			    sizeof (mzap_ent_phys_t));
 			mze_remove(zap, mze);
 		}
 	}
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Remove the entry identified by the uint64 array key "key" through
  * fzap_remove().  Returns an error from zap_lockdir(), ENOTSUP if the
  * key cannot be set up for this zap, or the fzap_remove() result.
  */
 int
 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		err = fzap_remove(zn, tx);
 		zap_name_free(zn);
 	} else {
 		err = SET_ERROR(ENOTSUP);
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Routines for iterating over the attributes.
  */
 
 /*
  * Initialize cursor "zc" for ZAP object "zapobj", recording the
  * serialized position "serialized".  No zap or leaf is attached yet and
  * the hash and collision differentiator start at 0.
  */
 void
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
 {
 	/* Which ZAP to iterate, and where to resume. */
 	zc->zc_objset = os;
 	zc->zc_zapobj = zapobj;
 	zc->zc_serialized = serialized;
 
 	/* Nothing is attached or decoded yet. */
 	zc->zc_zap = NULL;
 	zc->zc_leaf = NULL;
 	zc->zc_hash = 0;
 	zc->zc_cd = 0;
 }
 
 /*
  * Initialize cursor "zc" at serialized position 0 of ZAP object
  * "zapobj".
  */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	const uint64_t start = 0;
 
 	zap_cursor_init_serialized(zc, os, zapobj, start);
 }
 
 /*
  * Release whatever the cursor still has attached.  zap_unlockdir() and
  * zap_put_leaf() are each called with the corresponding rwlock held,
  * so the lock is taken as reader immediately before the call.
  */
 void
 zap_cursor_fini(zap_cursor_t *zc)
 {
 	zap_t *zap = zc->zc_zap;
 
 	if (zap != NULL) {
 		rw_enter(&zap->zap_rwlock, RW_READER);
 		zap_unlockdir(zap);
 		zc->zc_zap = NULL;
 	}
 
 	if (zc->zc_leaf != NULL) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
 	}
 
 	zc->zc_objset = NULL;
 }
 
 /*
  * Encode the cursor position as a single uint64_t.  A cursor whose
  * hash is -1ULL serializes to -1ULL, and a cursor with no zap attached
  * returns the serialized value it was initialized with.
  */
 uint64_t
 zap_cursor_serialize(zap_cursor_t *zc)
 {
 	int hb;
 	uint64_t hashpart, cdpart;
 
 	if (zc->zc_hash == -1ULL)
 		return (-1ULL);
 	if (zc->zc_zap == NULL)
 		return (zc->zc_serialized);
 	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
 	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
 
 	/*
 	 * We want to keep the high 32 bits of the cursor zero if we can, so
 	 * that 32-bit programs can access this.  So usually use a small
 	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
 	 * of the cursor.
 	 *
 	 * [ collision differentiator | zap_hashbits()-bit hash value ]
 	 */
 	hb = zap_hashbits(zc->zc_zap);
 	hashpart = zc->zc_hash >> (64 - hb);
 	cdpart = (uint64_t)zc->zc_cd << hb;
 
 	return (hashpart | cdpart);
 }
 
 /*
  * Fill "za" with the entry at the cursor's current position.
  *
  * On first use the ZAP is locked through zap_lockdir() and the
  * serialized position is decoded into zc_hash/zc_cd; on later calls
  * only zap_rwlock is re-acquired as reader.  The rwlock is dropped
  * again before returning; the zap itself stays attached to the cursor.
  *
  * Returns 0 with "za" filled in, ENOENT when the cursor is exhausted
  * (zc_hash is then -1ULL), or an error from zap_lockdir() or
  * fzap_cursor_retrieve().
  */
 int
 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err;
 	avl_index_t idx;
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 
 	/* A hash of -1ULL marks a cursor that already hit the end. */
 	if (zc->zc_hash == -1ULL)
 		return (SET_ERROR(ENOENT));
 
 	if (zc->zc_zap == NULL) {
 		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
 		if (err)
 			return (err);
 
 		/*
 		 * To support zap_cursor_init_serialized, advance, retrieve,
 		 * we must add to the existing zc_cd, which may already
 		 * be 1 due to the zap_cursor_advance.
 		 */
 		ASSERT(zc->zc_hash == 0);
 		hb = zap_hashbits(zc->zc_zap);
 		/* Inverse of the packing done in zap_cursor_serialize(). */
 		zc->zc_hash = zc->zc_serialized << (64 - hb);
 		zc->zc_cd += zc->zc_serialized >> hb;
 		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
 			zc->zc_cd = 0;
 	} else {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	}
 	if (!zc->zc_zap->zap_ismicro) {
 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
 		mze_tofind.mze_hash = zc->zc_hash;
 		mze_tofind.mze_cd = zc->zc_cd;
 
 		/* Exact (hash, cd) match, else the next entry after it. */
 		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
 		if (mze == NULL) {
 			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
 			    idx, AVL_AFTER);
 		}
 		if (mze) {
 			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
 			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
 			za->za_normalization_conflict =
 			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
 			/* A microzap entry is always one 8-byte integer. */
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
 			za->za_first_integer = mzep->mze_value;
 			(void) strcpy(za->za_name, mzep->mze_name);
 			/* Move the cursor onto the entry just returned. */
 			zc->zc_hash = mze->mze_hash;
 			zc->zc_cd = mze->mze_cd;
 			err = 0;
 		} else {
 			zc->zc_hash = -1ULL;
 			err = SET_ERROR(ENOENT);
 		}
 	}
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
 /*
  * Step the cursor past the current entry by bumping the collision
  * differentiator.  A cursor whose hash is -1ULL is left untouched.
  */
 void
 zap_cursor_advance(zap_cursor_t *zc)
 {
 	if (zc->zc_hash != -1ULL)
 		zc->zc_cd++;
 }
 
 /*
  * Position the cursor on the entry named "name" using match type "mt".
  * The ZAP is locked on first use and otherwise only zap_rwlock is
  * re-acquired as reader; the rwlock is dropped before returning.
  *
  * Returns 0, ENOTSUP if the name cannot be set up for this zap, ENOENT
  * if a microzap has no such entry, or an error from zap_lockdir() or
  * fzap_cursor_move_to_key().
  */
 int
 zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
 {
 	zap_name_t *zn;
 	mzap_ent_t *mze;
 	int err = 0;
 
 	if (zc->zc_zap != NULL) {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	} else {
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
 		if (err != 0)
 			return (err);
 	}
 
 	zn = zap_name_alloc(zc->zc_zap, name, mt);
 	if (zn == NULL) {
 		rw_exit(&zc->zc_zap->zap_rwlock);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zc->zc_zap->zap_ismicro) {
 		mze = mze_find(zn);
 		if (mze != NULL) {
 			zc->zc_hash = mze->mze_hash;
 			zc->zc_cd = mze->mze_cd;
 		} else {
 			err = SET_ERROR(ENOENT);
 		}
 	} else {
 		err = fzap_cursor_move_to_key(zc, zn);
 	}
 
 	zap_name_free(zn);
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
 /*
  * Fill "zs" with statistics for ZAP object "zapobj".  The structure is
  * zeroed first.  A microzap reports its block size, entry count and a
  * block count of 1; a fat zap is described by fzap_get_stats().
  * Returns 0, or an error from zap_lockdir().
  */
 int
 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 {
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	bzero(zs, sizeof (zap_stats_t));
 
 	if (!zap->zap_ismicro) {
 		fzap_get_stats(zap, zs);
 	} else {
 		zs->zs_blocksize = zap->zap_dbuf->db_size;
 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
 		zs->zs_num_blocks = 1;
 	}
 
 	zap_unlockdir(zap);
 	return (0);
 }
 
 /*
  * Estimate how many bytes an operation on entry "name" of ZAP object
  * "zapobj" may write, adding the estimate to *towrite and *tooverwrite.
  * "add" is non-zero when the operation adds an entry.  Returns 0, or an
  * error from zap_lockdir() or fzap_count_write().
  */
 int
 zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
     uint64_t *towrite, uint64_t *tooverwrite)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int err = 0;
 
 	/*
 	 * Without a name we cannot figure out which blocks will be
 	 * affected by this operation, so account for the worst case:
 	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
 	 * - 4 new blocks written if adding:
 	 * 	- 2 blocks for possibly split leaves,
 	 * 	- 2 grown ptrtbl blocks
 	 *
 	 * This also accommodates the case where an add operation to a
 	 * fairly large microzap results in a promotion to fatzap.
 	 */
 	if (name == NULL) {
 		*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
 		return (err);
 	}
 
 	/*
 	 * We lock the zap with adding == FALSE, because passing the
 	 * actual value of add could trigger a mzap_upgrade().  At present
 	 * we are only evaluating the possibility of this operation, and
 	 * hence we do not want to trigger an upgrade.
 	 */
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	if (zap->zap_ismicro) {
 		/*
 		 * We are here if (name != NULL) and this is a micro-zap.
 		 * We account for the header block depending on whether it
 		 * is freeable.
 		 *
 		 * In case of an add operation it is hard to find out
 		 * whether this add will promote the microzap to a fatzap.
 		 * Hence, we consider the worst case and account for the
 		 * blocks assuming the microzap would be promoted to a
 		 * fatzap.
 		 *
 		 * 1 block overwritten  : header block
 		 * 4 new blocks written : 2 new split leaf, 2 grown
 		 *			ptrtbl blocks
 		 */
 		if (!dmu_buf_freeable(zap->zap_dbuf))
 			*towrite += MZAP_MAX_BLKSZ;
 		else
 			*tooverwrite += MZAP_MAX_BLKSZ;
 
 		if (add)
 			*towrite += 4 * MZAP_MAX_BLKSZ;
 	} else {
 		zn = zap_name_alloc(zap, name, MT_EXACT);
 		if (zn == NULL) {
 			/*
 			 * We treat this case as similar to (name == NULL)
 			 */
 			*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
 		} else {
 			err = fzap_count_write(zn, add, towrite,
 			    tooverwrite);
 			zap_name_free(zn);
 		}
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 275782)
@@ -1,6558 +1,6559 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
  */
 
 /*
  * ZFS ioctls.
  *
  * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
  * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
  *
  * There are two ways that we handle ioctls: the legacy way where almost
  * all of the logic is in the ioctl callback, and the new way where most
  * of the marshalling is handled in the common entry point, zfsdev_ioctl().
  *
  * Non-legacy ioctls should be registered by calling
  * zfs_ioctl_register() from zfs_ioctl_init().  The ioctl is invoked
  * from userland by lzc_ioctl().
  *
  * The registration arguments are as follows:
  *
  * const char *name
  *   The name of the ioctl.  This is used for history logging.  If the
  *   ioctl returns successfully (the callback returns 0), and allow_log
  *   is true, then a history log entry will be recorded with the input &
  *   output nvlists.  The log entry can be printed with "zpool history -i".
  *
  * zfs_ioc_t ioc
  *   The ioctl request number, which userland will pass to ioctl(2).
  *   The ioctl numbers can change from release to release, because
  *   the caller (libzfs) must be matched to the kernel.
  *
  * zfs_secpolicy_func_t *secpolicy
  *   This function will be called before the zfs_ioc_func_t, to
  *   determine if this operation is permitted.  It should return EPERM
  *   on failure, and 0 on success.  Checks include determining if the
  *   dataset is visible in this zone, and if the user has either all
  *   zfs privileges in the zone (SYS_MOUNT), or has been granted permission
  *   to do this operation on this dataset with "zfs allow".
  *
  * zfs_ioc_namecheck_t namecheck
  *   This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
  *   name, a dataset name, or nothing.  If the name is not well-formed,
  *   the ioctl will fail and the callback will not be called.
  *   Therefore, the callback can assume that the name is well-formed
  *   (e.g. is null-terminated, doesn't have more than one '@' character,
  *   doesn't have invalid characters).
  *
  * zfs_ioc_poolcheck_t pool_check
  *   This specifies requirements on the pool state.  If the pool does
  *   not meet them (is suspended or is readonly), the ioctl will fail
  *   and the callback will not be called.  If any checks are specified
  *   (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
  *   Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
  *   POOL_CHECK_READONLY).
  *
  * boolean_t smush_outnvlist
  *   If smush_outnvlist is true, then the output is presumed to be a
  *   list of errors, and it will be "smushed" down to fit into the
  *   caller's buffer, by removing some entries and replacing them with a
  *   single "N_MORE_ERRORS" entry indicating how many were removed.  See
  *   nvlist_smush() for details.  If smush_outnvlist is false, and the
  *   outnvlist does not fit into the userland-provided buffer, then the
  *   ioctl will fail with ENOMEM.
  *
  * zfs_ioc_func_t *func
  *   The callback function that will perform the operation.
  *
  *   The callback should return 0 on success, or an error number on
  *   failure.  If the function fails, the userland ioctl will return -1,
  *   and errno will be set to the callback's return value.  The callback
  *   will be called with the following arguments:
  *
  *   const char *name
  *     The name of the pool or dataset to operate on, from
  *     zfs_cmd_t:zc_name.  The 'namecheck' argument specifies the
  *     expected type (pool, dataset, or none).
  *
  *   nvlist_t *innvl
  *     The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src.  Or
  *     NULL if no input nvlist was provided.  Changes to this nvlist are
  *     ignored.  If the input nvlist could not be deserialized, the
  *     ioctl will fail and the callback will not be called.
  *
  *   nvlist_t *outnvl
  *     The output nvlist, initially empty.  The callback can fill it in,
  *     and it will be returned to userland by serializing it into
  *     zfs_cmd_t:zc_nvlist_dst.  If it is non-empty, and serialization
  *     fails (e.g. because the caller didn't supply a large enough
  *     buffer), then the overall ioctl will fail.  See the
  *     'smush_outnvlist' argument above for additional behaviors.
  *
  *     There are two typical uses of the output nvlist:
  *       - To return state, e.g. property values.  In this case,
  *         smush_outnvlist should be false.  If the buffer was not large
  *         enough, the caller will reallocate a larger buffer and try
  *         the ioctl again.
  *
  *       - To return multiple errors from an ioctl which makes on-disk
  *         changes.  In this case, smush_outnvlist should be true.
  *         Ioctls which make on-disk modifications should generally not
  *         use the outnvl if they succeed, because the caller can not
  *         distinguish between the operation failing, and
  *         deserialization failing.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/sunddi.h>
 #include <sys/policy.h>
 #include <sys/zone.h>
 #include <sys/nvpair.h>
 #include <sys/mount.h>
 #include <sys/taskqueue.h>
 #include <sys/sdt.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
 #include "zfs_comutil.h"
 #include "zfs_ioctl_compat.h"
 
 /* Compile-time check that a zfs_cmd_t is smaller than IOCPARM_MAX. */
 CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
 
 /* Backing variable for the snapshot_list_prefetch sysctl declared below. */
 static int snapshot_list_prefetch;
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, snapshot_list_prefetch, CTLFLAG_RWTUN,
     &snapshot_list_prefetch, 0, "Prefetch data when listing snapshots");
 
 /* NOTE(review): presumably the /dev/zfs character device -- confirm. */
 static struct cdev *zfsdev;
 
 extern void zfs_init(void);
 extern void zfs_fini(void);
 
 uint_t zfs_fsyncer_key;
 extern uint_t rrw_tsd_key;
 /* TSD key read with tsd_get() by zfs_secpolicy_log_history(). */
 static uint_t zfs_allow_log_key;
 
 /*
  * Callback signatures.  A legacy ioctl handler receives only the zfs_cmd_t;
  * a non-legacy handler receives the name plus the input and output nvlists;
  * a security policy receives the command, the input nvlist and the caller's
  * credentials.
  */
 typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
 typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
 typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
 
 /* What to expect in zfs_cmd_t:zc_name: nothing, a pool, or a dataset. */
 typedef enum {
 	NO_NAME,
 	POOL_NAME,
 	DATASET_NAME
 } zfs_ioc_namecheck_t;
 
 /* Pool-state requirements; the values are bit flags that may be or-ed. */
 typedef enum {
 	POOL_CHECK_NONE		= 1 << 0,
 	POOL_CHECK_SUSPENDED	= 1 << 1,
 	POOL_CHECK_READONLY	= 1 << 2,
 } zfs_ioc_poolcheck_t;
 
 /*
  * Per-ioctl registration record; the fields mirror the registration
  * arguments described in the comment at the top of this file.
  */
 typedef struct zfs_ioc_vec {
 	zfs_ioc_legacy_func_t	*zvec_legacy_func;
 	zfs_ioc_func_t		*zvec_func;
 	zfs_secpolicy_func_t	*zvec_secpolicy;
 	zfs_ioc_namecheck_t	zvec_namecheck;
 	boolean_t		zvec_allow_log;
 	zfs_ioc_poolcheck_t	zvec_pool_check;
 	boolean_t		zvec_smush_outnvlist;
 	const char		*zvec_name;
 } zfs_ioc_vec_t;
 
 /*
  * This array is indexed by zfs_userquota_prop_t.  The userspace security
  * policies below index it with zc_objset_type after checking that value
  * against ZFS_NUM_USERQUOTA_PROPS.
  */
 static const char *userquota_perms[] = {
 	ZFS_DELEG_PERM_USERUSED,
 	ZFS_DELEG_PERM_USERQUOTA,
 	ZFS_DELEG_PERM_GROUPUSED,
 	ZFS_DELEG_PERM_GROUPQUOTA,
 };
 
 /* Forward declarations for functions defined outside this part of the file. */
 static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
 static int zfs_check_settable(const char *name, nvpair_t *property,
     cred_t *cr);
 static int zfs_check_clearable(char *dataset, nvlist_t *props,
     nvlist_t **errors);
 static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
     boolean_t *);
 int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
 static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
  
 /* More forward declarations; the definitions are not in this part of the file. */
 static void zfsdev_close(void *data);
 
 static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
 
 /*
  * Format a debug message and hand it, together with the basename of the
  * source file, the function name and the line number, to the zfs-dprintf
  * DTrace probe.  Messages longer than the 512-byte local buffer are
  * truncated by vsnprintf().
  */
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	char msg[512];
 	va_list ap;
 	/*
 	 * Get rid of annoying "../common/" prefix to filename: keep only
 	 * what follows the last '/', or the whole string if there is none.
 	 */
 	const char *slash = strrchr(file, '/');
 	const char *base = (slash != NULL) ? slash + 1 : file;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	/*
 	 * To get this data, use the zfs-dprintf probe as so:
 	 * dtrace -q -n 'zfs-dprintf \
 	 *	/stringof(arg0) == "dbuf.c"/ \
 	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
 	 * arg0 = file name
 	 * arg1 = function name
 	 * arg2 = line number
 	 * arg3 = message
 	 */
 	DTRACE_PROBE4(zfs__dprintf,
 	    char *, base, char *, func, int, line, char *, msg);
 }
 
 /*
  * Release a history buffer obtained from history_str_get().  The size passed
  * to kmem_free() matches the HIS_MAX_RECORD_LEN used for the allocation.
  */
 static void
 history_str_free(char *buf)
 {
 	kmem_free(buf, HIS_MAX_RECORD_LEN);
 }
 
 /*
  * Copy the history string referenced by zc->zc_history into a freshly
  * allocated HIS_MAX_RECORD_LEN buffer.  Returns NULL when zc_history is 0
  * or when copyinstr() fails; otherwise the caller releases the buffer with
  * history_str_free().
  */
 static char *
 history_str_get(zfs_cmd_t *zc)
 {
 	char *record;
 	int err;
 
 	if (zc->zc_history == 0)
 		return (NULL);
 
 	record = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
 	err = copyinstr((void *)(uintptr_t)zc->zc_history, record,
 	    HIS_MAX_RECORD_LEN, NULL);
 	if (err != 0) {
 		history_str_free(record);
 		return (NULL);
 	}
 
 	/* Force termination regardless of what was copied in. */
 	record[HIS_MAX_RECORD_LEN - 1] = '\0';
 	return (record);
 }
 
 /*
  * Report whether the named dataset is the pool's current bootfs, i.e.
  * whether its objset id equals spa_bootfs() of its pool.  Returns B_FALSE
  * if the objset cannot be held.
  */
 static boolean_t
 zfs_is_bootfs(const char *name)
 {
 	objset_t *os;
 	boolean_t is_bootfs;
 
 	if (dmu_objset_hold(name, FTAG, &os) != 0)
 		return (B_FALSE);
 
 	is_bootfs = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
 	dmu_objset_rele(os, FTAG);
 	return (is_bootfs);
 }
 
 /*
  * Return 1 if the pool can be opened and its spa version is less than the
  * requested version; return 0 otherwise (including when the open fails).
  */
 static int
 zfs_earlier_version(const char *name, int version)
 {
 	spa_t *spa;
 	int earlier;
 
 	if (spa_open(name, &spa, FTAG) != 0)
 		return (0);
 
 	earlier = (spa_version(spa) < version) ? 1 : 0;
 	spa_close(spa, FTAG);
 	return (earlier);
 }
 
 /*
  * Return B_TRUE if the ZPL version is less than the requested version.
  * B_TRUE is also the answer whenever the version cannot be determined:
  * the objset cannot be held, it is not a DMU_OST_ZFS objset, or the
  * version property lookup fails.
  */
 static boolean_t
 zpl_earlier_version(const char *name, int version)
 {
 	objset_t *os;
 	uint64_t zplversion;
 	boolean_t earlier = B_TRUE;
 
 	if (dmu_objset_hold(name, FTAG, &os) != 0)
 		return (B_TRUE);
 
 	/* XXX reading from non-owned objset */
 	if (dmu_objset_type(os) == DMU_OST_ZFS &&
 	    zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
 		earlier = zplversion < version;
 
 	dmu_objset_rele(os, FTAG);
 	return (earlier);
 }
 
 /*
  * Log the caller-supplied history string (if any) to the pool named by
  * zc->zc_name, provided the pool opens and its version is at least
  * SPA_VERSION_ZPOOL_HISTORY.  Failures are silently ignored.
  */
 static void
 zfs_log_history(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	char *msg = history_str_get(zc);
 
 	if (msg == NULL)
 		return;
 
 	if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
 			(void) spa_history_log(spa, msg);
 		spa_close(spa, FTAG);
 	}
 
 	history_str_free(msg);
 }
 
 /*
  * Policy for top-level read operations (list pools).  Requires no privileges,
  * and can be used in the local zone, as there is no associated dataset.
  * Always returns 0; none of the arguments are examined.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (0);
 }
 
 /*
  * Policy for dataset read operations (list children, get statistics).
  * No privileges are required, but outside the global zone the dataset named
  * by zc->zc_name must be visible to the caller; otherwise ENOENT is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread) &&
 	    !zone_dataset_visible(zc->zc_name, NULL))
 		return (SET_ERROR(ENOENT));
 
 	return (0);
 }
 
 /*
  * Zone check shared by zfs_dozonecheck() and zfs_dozonecheck_ds().
  * 'zoned' is the dataset's property value as looked up by the caller.
  * Returns 0 when access is allowed, ENOENT when the dataset is not visible
  * from a local zone, and EPERM for the remaining denials.
  */
 static int
 zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
 {
 	int writable = 1;
 
 	if (INGLOBALZONE(curthread)) {
 		/*
 		 * If the fs is zoned, only root can access it from the
 		 * global zone.
 		 */
 		if (secpolicy_zfs(cr) && zoned)
 			return (SET_ERROR(EPERM));
 		return (0);
 	}
 
 	/*
 	 * The dataset must be visible by this zone -- check this first
 	 * so they don't see EPERM on something they shouldn't know about.
 	 */
 	if (!zone_dataset_visible(dataset, &writable))
 		return (SET_ERROR(ENOENT));
 
 	/* If we are in a local zone, the 'zoned' property must be set. */
 	if (!zoned)
 		return (SET_ERROR(EPERM));
 
 	/* must be writable by this zone */
 	if (!writable)
 		return (SET_ERROR(EPERM));
 
 	return (0);
 }
 
 /*
  * Look up the "jailed" property of the named dataset and run the zone
  * check on it.  A failed property lookup is reported as ENOENT.
  */
 static int
 zfs_dozonecheck(const char *dataset, cred_t *cr)
 {
 	uint64_t zoned_val;
 
 	if (dsl_prop_get_integer(dataset, "jailed", &zoned_val, NULL) != 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zfs_dozonecheck_impl(dataset, zoned_val, cr));
 }
 
 /*
  * Same as zfs_dozonecheck(), but the "jailed" property is read from an
  * already-held dataset.  A failed property lookup is reported as ENOENT.
  */
 static int
 zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
 {
 	uint64_t zoned_val;
 
 	if (dsl_prop_get_int_ds(ds, "jailed", &zoned_val) != 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zfs_dozonecheck_impl(dataset, zoned_val, cr));
 }
 
 /*
  * Check permission 'perm' on a held dataset.  The zone check must pass
  * first; after that the request is allowed if secpolicy_zfs() accepts the
  * credentials, and otherwise the result of dsl_deleg_access_impl() decides.
  */
 static int
 zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
     const char *perm, cred_t *cr)
 {
 	int error = zfs_dozonecheck_ds(name, ds, cr);
 
 	if (error != 0)
 		return (error);
 
 	if (secpolicy_zfs(cr) == 0)
 		return (0);
 
 	return (dsl_deleg_access_impl(ds, perm, cr));
 }
 
 /*
  * Check permission 'perm' on the dataset called 'name'.  Holds the pool and
  * the dataset for the duration of zfs_secpolicy_write_perms_ds() and
  * releases both before returning.  Hold failures are returned unchanged.
  */
 static int
 zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_pool_hold(name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, name, FTAG, &ds);
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 #ifdef SECLABEL
 /*
  * Policy for setting the security label property.
  *
  * 'name' is the dataset, 'strval' the requested label (either
  * ZFS_MLSLABEL_DEFAULT or a hex label string) and 'cr' the credentials the
  * final privilege check is made against.
  *
  * Returns 0 for success, non-zero for access and other errors.
  */
 static int
 zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
 {
 	char		ds_hexsl[MAXNAMELEN];
 	bslabel_t	ds_sl, new_sl;
 	boolean_t	new_default = FALSE;
 	uint64_t	zoned;
 	int		needed_priv = -1;
 	int		error;
 
 	/* First get the existing dataset label. */
 	error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error != 0)
 		return (SET_ERROR(EPERM));
 
 	if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
 		new_default = TRUE;
 
 	/* The label must be translatable */
 	if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * In a non-global zone, disallow attempts to set a label that
 	 * doesn't match that of the zone; otherwise no other checks
 	 * are needed.
 	 */
 	if (!INGLOBALZONE(curproc)) {
 		if (new_default || !blequal(&new_sl, CR_SL(CRED())))
 			return (SET_ERROR(EPERM));
 		return (0);
 	}
 
 	/*
 	 * For global-zone datasets (i.e., those whose zoned property is
 	 * "off"), verify that the specified new label is valid for the
 	 * global zone.
 	 */
 	if (dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 		return (SET_ERROR(EPERM));
 	if (!zoned) {
 		if (zfs_check_global_label(name, strval) != 0)
 			return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * If the existing dataset label is nondefault, check if the
 	 * dataset is mounted (label cannot be changed while mounted).
 	 * Get the zfsvfs; if there isn't one, then the dataset isn't
 	 * mounted (or isn't a dataset, doesn't exist, ...).
 	 */
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
 		objset_t *os;
 		static char *setsl_tag = "setsl_tag";
 
 		/*
 		 * Try to own the dataset; abort if there is any error,
 		 * (e.g., already mounted, in use, or other error).
 		 */
 		error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
 		    setsl_tag, &os);
 		if (error != 0)
 			return (SET_ERROR(EPERM));
 
 		dmu_objset_disown(os, setsl_tag);
 
 		if (new_default) {
 			needed_priv = PRIV_FILE_DOWNGRADE_SL;
 			goto out_check;
 		}
 
 		if (hexstr_to_label(strval, &new_sl) != 0)
 			return (SET_ERROR(EPERM));
 
 		/*
 		 * Translate the existing label as well; without this the
 		 * dominance comparisons below would read an uninitialized
 		 * ds_sl.
 		 */
 		if (hexstr_to_label(ds_hexsl, &ds_sl) != 0)
 			return (SET_ERROR(EPERM));
 
 		if (blstrictdom(&ds_sl, &new_sl))
 			needed_priv = PRIV_FILE_DOWNGRADE_SL;
 		else if (blstrictdom(&new_sl, &ds_sl))
 			needed_priv = PRIV_FILE_UPGRADE_SL;
 	} else {
 		/* dataset currently has a default label */
 		if (!new_default)
 			needed_priv = PRIV_FILE_UPGRADE_SL;
 	}
 
 out_check:
 	/* needed_priv stays -1 when the change needs no extra privilege. */
 	if (needed_priv != -1)
 		return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
 	return (0);
 }
 #endif	/* SECLABEL */
 
 /*
  * Security policy for setting property 'prop' on dataset 'dsname'.
  * A few properties get extra checks first (see the switch); every property
  * that passes them is then subject to the delegated-permission check named
  * after the property.  'propval' is only consulted for ZFS_PROP_MLSLABEL in
  * SECLABEL builds.  Returns 0 if permitted, otherwise an errno value.
  */
 static int
 zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
     cred_t *cr)
 {
 #ifdef SECLABEL
 	char *strval;
 #endif
 
 	/*
 	 * Check permissions for special properties.
 	 */
 	switch (prop) {
 	case ZFS_PROP_ZONED:
 		/*
 		 * Disallow setting of 'zoned' from within a local zone.
 		 */
 		if (!INGLOBALZONE(curthread))
 			return (SET_ERROR(EPERM));
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
 			char setpoint[MAXNAMELEN];
 			/*
 			 * Unprivileged users are allowed to modify the
 			 * limit on things *under* (ie. contained by)
 			 * the thing they own.
 			 */
 			if (dsl_prop_get_integer(dsname, "jailed", &zoned,
 			    setpoint))
 				return (SET_ERROR(EPERM));
 			if (!zoned || strlen(dsname) <= strlen(setpoint))
 				return (SET_ERROR(EPERM));
 		}
 		break;
 
 	case ZFS_PROP_MLSLABEL:
 #ifdef SECLABEL
 		if (!is_system_labeled())
 			return (SET_ERROR(EPERM));
 
 		if (nvpair_value_string(propval, &strval) == 0) {
 			int err;
 
 			err = zfs_set_slabel_policy(dsname, strval, CRED());
 			if (err != 0)
 				return (err);
 		}
 		break;
 #else
 		/* Labels are not supported without SECLABEL. */
 		return (SET_ERROR(EOPNOTSUPP));
 #endif
 
 	default:
 		/* No special handling; fall through to the common check. */
 		break;
 	}
 
 	return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
 }
 
 /*
  * Policy for setting the delegation ACL of zc->zc_name: only the zone
  * check is performed here, and its result is returned as-is.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	/*
 	 * permission to set permissions will be evaluated later in
 	 * dsl_deleg_can_allow()
 	 */
 	return (zfs_dozonecheck(zc->zc_name, cr));
 }
 
 /*
  * Policy for rollback: requires ZFS_DELEG_PERM_ROLLBACK on zc->zc_name,
  * as decided by zfs_secpolicy_write_perms().
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_ROLLBACK, cr));
 }
 
 /*
  * Policy for the legacy send ioctl.  zc->zc_name must contain an '@'
  * (EINVAL otherwise).  The dataset is looked up by zc->zc_sendobj, and
  * zc->zc_name is overwritten with that dataset's current name before
  * ZFS_DELEG_PERM_SEND is checked against it.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	/*
 	 * Generate the current snapshot name from the given objsetid, then
 	 * use that name for the secpolicy/zone checks.
 	 */
 	if (strchr(zc->zc_name, '@') == NULL)
 		return (SET_ERROR(EINVAL));
 
 	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
 	if (error == 0) {
 		dsl_dataset_name(ds, zc->zc_name);
 		error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
 		    ZFS_DELEG_PERM_SEND, cr);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Policy for the nvlist-based send ioctl: requires ZFS_DELEG_PERM_SEND on
  * zc->zc_name, as decided by zfs_secpolicy_write_perms().
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_SEND, cr));
 }
 
 /*
  * Delegated share policy.  zc->zc_value is resolved as a path; the vnode's
  * filesystem must be of type "zfs" and its resource name must equal
  * zc->zc_name, else EPERM.  When both match, the decision is left to
  * dsl_deleg_access() with ZFS_DELEG_PERM_SHARE.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	vnode_t *vp;
 	boolean_t matches;
 	int error;
 
 	error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
 	if (error != 0)
 		return (error);
 
 	/* Now make sure mntpnt and dataset are ZFS */
 	matches = (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") == 0 &&
 	    strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) == 0);
 	VN_RELE(vp);
 
 	if (!matches)
 		return (SET_ERROR(EPERM));
 
 	return (dsl_deleg_access(zc->zc_name,
 	    ZFS_DELEG_PERM_SHARE, cr));
 }
 
 /*
  * Share policy: refused with EPERM outside the global zone.  Inside it,
  * callers accepted by secpolicy_nfs() are allowed outright; everyone else
  * goes through the delegated share check.
  */
 int
 zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread))
 		return (SET_ERROR(EPERM));
 
 	if (secpolicy_nfs(cr) != 0)
 		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 
 	return (0);
 }
 
 /*
  * SMB ACL policy: refused with EPERM outside the global zone.  Inside it,
  * callers accepted by secpolicy_smb() are allowed outright; everyone else
  * goes through the delegated share check.
  */
 int
 zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread))
 		return (SET_ERROR(EPERM));
 
 	if (secpolicy_smb(cr) != 0)
 		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 
 	return (0);
 }
 
 /*
  * Write the parent of 'datasetname' into 'parent', a buffer of 'parentsize'
  * bytes.  The parent is everything before the last '@' if there is one,
  * otherwise everything before the last '/'.
  *
  * Returns 0 on success, ENOENT if the name has neither separator, and
  * ENAMETOOLONG if the name (with its terminator) does not fit in 'parent'.
  */
 static int
 zfs_get_parent(const char *datasetname, char *parent, int parentsize)
 {
 	char *cp;
 
 	/*
 	 * strncpy() leaves the destination unterminated when the source
 	 * fills it, which would let strrchr() below run off the end of
 	 * 'parent'.  Reject names that cannot fit instead of truncating.
 	 */
 	if (parentsize <= 0 || strlen(datasetname) >= (size_t)parentsize)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	/*
 	 * Remove the @bla or /bla from the end of the name to get the parent.
 	 */
 	(void) strncpy(parent, datasetname, parentsize);
 	cp = strrchr(parent, '@');
 	if (cp == NULL)
 		cp = strrchr(parent, '/');
 	if (cp == NULL)
 		return (SET_ERROR(ENOENT));
 	cp[0] = '\0';
 
 	return (0);
 }
 
 /*
  * Destroying 'name' requires both ZFS_DELEG_PERM_MOUNT and
  * ZFS_DELEG_PERM_DESTROY; the first failing check's error is returned.
  */
 int
 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
 {
 	int error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr);
 
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
 }
 
 /*
  * Policy for destroy: applies zfs_secpolicy_destroy_perms() to zc->zc_name.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
 }
 
 /*
  * Destroying snapshots with delegated permissions requires
  * descendant mount and destroy permissions.
  *
  * Every name in the "snaps" nvlist of 'innvl' is checked; EINVAL is
  * returned if that nvlist is missing.  Names that yield ENOENT are removed
  * from the list, and any other error stops the walk and is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvlist_t *snaps;
 	nvpair_t *pair;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 
 	pair = nvlist_next_nvpair(snaps, NULL);
 	while (pair != NULL) {
 		/* Fetch the successor first; 'pair' may be removed below. */
 		nvpair_t *next = nvlist_next_nvpair(snaps, pair);
 		int error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
 
 		if (error == ENOENT) {
 			/*
 			 * Ignore any snapshots that don't exist (we consider
 			 * them "already destroyed").  Remove the name from the
 			 * nvl here in case the snapshot is created between
 			 * now and when we try to destroy it (in which case
 			 * we don't want to destroy it since we haven't
 			 * checked for permission).
 			 */
 			fnvlist_remove_nvpair(snaps, pair);
 		} else if (error != 0) {
 			return (error);
 		}
 		pair = next;
 	}
 
 	return (0);
 }
 
 /*
  * Renaming 'from' to 'to' requires rename and mount permission on 'from',
  * plus create and mount permission on the parent of 'to'.  The checks run
  * in that order and the first failure is returned.
  */
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
 	char	parentname[MAXNAMELEN];
 	int	error;
 
 	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_get_parent(to, parentname, sizeof (parentname));
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_write_perms(parentname,
 	    ZFS_DELEG_PERM_CREATE, cr);
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_write_perms(parentname,
 	    ZFS_DELEG_PERM_MOUNT, cr));
 }
 
 /*
  * Policy for rename from zc->zc_name to zc->zc_value.  When bit 0 of
  * zc->zc_cookie is set (recursive rename), zc->zc_name must contain an '@'
  * (EINVAL otherwise) and the check is made against the part before it; the
  * '@' is put back before returning.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char *at = NULL;
 	int error;
 
 	if ((zc->zc_cookie & 1) != 0) {
 		/*
 		 * This is recursive rename, so the starting snapshot might
 		 * not exist. Check file system or volume permission instead.
 		 */
 		at = strchr(zc->zc_name, '@');
 		if (at == NULL)
 			return (SET_ERROR(EINVAL));
 		*at = '\0';
 	}
 
 	error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
 
 	/* Undo the temporary truncation of zc_name. */
 	if (at != NULL)
 		*at = '@';
 
 	return (error);
 }
 
 /*
  * Policy for promote.  Requires promote permission on zc->zc_name, then
  * mount permission on that clone and promote permission on its origin
  * dataset (looked up through the clone's dsl_dir dd_origin_obj).
  * Returns 0 if permitted, otherwise the first error encountered.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *clone;
 	int error;
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_PROMOTE, cr);
 	if (error != 0)
 		return (error);
 
 	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
 
 	/* On failure, skip to the pool release at the bottom. */
 	if (error == 0) {
 		char parentname[MAXNAMELEN];
 		dsl_dataset_t *origin = NULL;
 		dsl_dir_t *dd;
 		dd = clone->ds_dir;
 
 		error = dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &origin);
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
 		if (error != 0) {
 			/* Origin not held: drop only the clone and the pool. */
 			dsl_dataset_rele(clone, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
 
 		error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
 		    ZFS_DELEG_PERM_MOUNT, cr);
 
 		/* The origin's name is fetched even if the mount check failed. */
 		dsl_dataset_name(origin, parentname);
 		if (error == 0) {
 			error = zfs_secpolicy_write_perms_ds(parentname, origin,
 			    ZFS_DELEG_PERM_PROMOTE, cr);
 		}
 		dsl_dataset_rele(clone, FTAG);
 		dsl_dataset_rele(origin, FTAG);
 	}
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Policy for receive: requires receive, mount and create permission on
  * zc->zc_name, checked in that order; the first failure is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_RECEIVE, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_MOUNT, cr);
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_CREATE, cr));
 }
 
 /*
  * Check ZFS_DELEG_PERM_SNAPSHOT on 'name' via zfs_secpolicy_write_perms().
  */
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
 	return (zfs_secpolicy_write_perms(name,
 	    ZFS_DELEG_PERM_SNAPSHOT, cr));
 }
 
 /*
  * Check for permission to create each snapshot in the nvlist.
  *
  * The names come from the "snaps" nvlist inside 'innvl' (EINVAL if it is
  * missing).  Each name must contain an '@' (EINVAL otherwise); the part
  * before it is checked with zfs_secpolicy_snapshot_perms().  Returns 0 when
  * every name passes, which includes the case of an empty list.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvlist_t *snaps;
 	/* Start at 0 so an empty "snaps" list does not return garbage. */
 	int error = 0;
 	nvpair_t *pair;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		char *name = nvpair_name(pair);
 		char *atp = strchr(name, '@');
 
 		if (atp == NULL) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		/* Temporarily cut the name at '@' to check the dataset part. */
 		*atp = '\0';
 		error = zfs_secpolicy_snapshot_perms(name, cr);
 		*atp = '@';
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Check for permission to create each bookmark in the nvlist.
  *
  * Each pair name in 'innvl' must contain a '#' (EINVAL otherwise); the
  * part before it is checked for ZFS_DELEG_PERM_BOOKMARK.  The first
  * failure ends the walk and is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
 
 	while (pair != NULL) {
 		char *name = nvpair_name(pair);
 		char *hashp = strchr(name, '#');
 		int error;
 
 		if (hashp == NULL)
 			return (SET_ERROR(EINVAL));
 
 		/* Temporarily cut the name at '#' to check the dataset part. */
 		*hashp = '\0';
 		error = zfs_secpolicy_write_perms(name,
 		    ZFS_DELEG_PERM_BOOKMARK, cr);
 		*hashp = '#';
 		if (error != 0)
 			return (error);
 
 		pair = nvlist_next_nvpair(innvl, pair);
 	}
 
 	return (0);
 }
 
 /*
  * Check for permission to destroy each bookmark named in 'innvl'.
  *
  * Each pair name must contain a '#' (EINVAL otherwise); the part before it
  * is checked for ZFS_DELEG_PERM_DESTROY.  Names that yield ENOENT are
  * removed from 'innvl'; any other error ends the walk and is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
 
 	while (pair != NULL) {
 		/* Fetch the successor first; 'pair' may be removed below. */
 		nvpair_t *next = nvlist_next_nvpair(innvl, pair);
 		char *name = nvpair_name(pair);
 		char *hashp = strchr(name, '#');
 		int error;
 
 		if (hashp == NULL)
 			return (SET_ERROR(EINVAL));
 
 		*hashp = '\0';
 		error = zfs_secpolicy_write_perms(name,
 		    ZFS_DELEG_PERM_DESTROY, cr);
 		*hashp = '#';
 
 		if (error == ENOENT) {
 			/*
 			 * Ignore any filesystems that don't exist (we consider
 			 * their bookmarks "already destroyed").  Remove
 			 * the name from the nvl here in case the filesystem
 			 * is created between now and when we try to destroy
 			 * the bookmark (in which case we don't want to
 			 * destroy it since we haven't checked for permission).
 			 */
 			fnvlist_remove_nvpair(innvl, pair);
 		} else if (error != 0) {
 			return (error);
 		}
 		pair = next;
 	}
 
 	return (0);
 }
 
 /*
  * Policy for logging history: allowed only when the calling thread has a
  * value stored under zfs_allow_log_key; EPERM otherwise.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	/*
 	 * Even root must have a proper TSD so that we know what pool
 	 * to log to.
 	 */
 	if (tsd_get(zfs_allow_log_key) != NULL)
 		return (0);
 
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Policy for create and clone.  If 'innvl' carries an "origin" string,
  * clone permission on that origin is required.  In every case create and
  * mount permission on the parent of zc->zc_name are required.
  */
 static int
 zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char	parentname[MAXNAMELEN];
 	char	*origin;
 	int	error;
 
 	error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname));
 	if (error != 0)
 		return (error);
 
 	if (nvlist_lookup_string(innvl, "origin", &origin) == 0) {
 		error = zfs_secpolicy_write_perms(origin,
 		    ZFS_DELEG_PERM_CLONE, cr);
 		if (error != 0)
 			return (error);
 	}
 
 	error = zfs_secpolicy_write_perms(parentname,
 	    ZFS_DELEG_PERM_CREATE, cr);
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_write_perms(parentname,
 	    ZFS_DELEG_PERM_MOUNT, cr));
 }
 
 /*
  * Policy for pool operations - create/destroy pools, add vdevs, etc.  Requires
  * SYS_CONFIG privilege, which is not available in a local zone.
  * Any failure from secpolicy_sys_config() is reported as EPERM.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (secpolicy_sys_config(cr, B_FALSE) == 0)
 		return (0);
 
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Policy for object to name lookups.  Callers accepted by
  * secpolicy_sys_config() are allowed outright; anyone else needs
  * ZFS_DELEG_PERM_DIFF on zc->zc_name.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (secpolicy_sys_config(cr, B_FALSE) == 0)
 		return (0);
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_DIFF, cr));
 }
 
 /*
  * Policy for fault injection.  Requires all privileges.
  * The decision is delegated entirely to secpolicy_zinject().
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (secpolicy_zinject(cr));
 }
 
 /*
  * Policy for inheriting the property named in zc->zc_value.  A name that
  * maps to a known property is checked with zfs_secpolicy_setprop() (with no
  * value).  Any other name must be a user property (EINVAL otherwise) and
  * requires ZFS_DELEG_PERM_USERPROP.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
 
 	if (prop != ZPROP_INVAL)
 		return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr));
 
 	if (!zfs_prop_user(zc->zc_value))
 		return (SET_ERROR(EINVAL));
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_USERPROP, cr));
 }
 
 /*
  * Policy for querying or setting a single user/group quota entry.  The read
  * policy must pass and zc->zc_objset_type must be a valid index into
  * userquota_perms[] (EINVAL otherwise).  With an empty zc->zc_value, a
  * caller asking about its own uid, or about a group it belongs to, is
  * allowed outright; everything else needs the matching delegated
  * permission.
  */
 static int
 zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int err = zfs_secpolicy_read(zc, innvl, cr);
 
 	if (err)
 		return (err);
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
 		return (SET_ERROR(EINVAL));
 
 	if (zc->zc_value[0] == 0) {
 		/*
 		 * They are asking about a posix uid/gid.  If it's
 		 * themself, allow it.
 		 */
 		boolean_t is_self;
 
 		if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
 		    zc->zc_objset_type == ZFS_PROP_USERQUOTA)
 			is_self = (zc->zc_guid == crgetuid(cr));
 		else
 			is_self = (groupmember(zc->zc_guid, cr) != 0);
 
 		if (is_self)
 			return (0);
 	}
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    userquota_perms[zc->zc_objset_type], cr));
 }
 
 /*
  * Security policy for listing user/group space or quota entries in bulk.
  *
  * Requires zfs_secpolicy_read() to succeed, a zc->zc_objset_type that is a
  * valid index into userquota_perms[] (EINVAL otherwise), and the matching
  * permission from that table on zc->zc_name.
  */
 static int
 zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int err;
 
 	if ((err = zfs_secpolicy_read(zc, innvl, cr)) != 0)
 		return (err);
 
 	if (zc->zc_objset_type < ZFS_NUM_USERQUOTA_PROPS) {
 		return (zfs_secpolicy_write_perms(zc->zc_name,
 		    userquota_perms[zc->zc_objset_type], cr));
 	}
 
 	return (SET_ERROR(EINVAL));
 }
 
 /*
  * Security policy for the userspace upgrade operation: the caller needs
  * whatever zfs_secpolicy_setprop() demands for setting ZFS_PROP_VERSION on
  * zc->zc_name.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	error = zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr);
 	return (error);
 }
 
 /*
  * Security policy for placing holds.
  *
  * 'innvl' must contain a "holds" nvlist (EINVAL otherwise).  For each pair in
  * it, the pair name is reduced to a filesystem name with dmu_fsname() and the
  * caller must have ZFS_DELEG_PERM_HOLD on that filesystem.  The first failure
  * ends the walk and is returned; 0 means every entry passed.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvlist_t *holds;
 	nvpair_t *elem = NULL;
 
 	if (nvlist_lookup_nvlist(innvl, "holds", &holds) != 0)
 		return (SET_ERROR(EINVAL));
 
 	while ((elem = nvlist_next_nvpair(holds, elem)) != NULL) {
 		char fsname[MAXNAMELEN];
 		int error;
 
 		error = dmu_fsname(nvpair_name(elem), fsname);
 		if (error == 0) {
 			error = zfs_secpolicy_write_perms(fsname,
 			    ZFS_DELEG_PERM_HOLD, cr);
 		}
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Security policy for releasing holds.
  *
  * Every pair name in 'innvl' is reduced to a filesystem name with
  * dmu_fsname(), and the caller must have ZFS_DELEG_PERM_RELEASE on that
  * filesystem.  The first failure ends the walk and is returned; 0 means every
  * entry passed.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvpair_t *elem = NULL;
 
 	while ((elem = nvlist_next_nvpair(innvl, elem)) != NULL) {
 		char fsname[MAXNAMELEN];
 		int error;
 
 		error = dmu_fsname(nvpair_name(elem), fsname);
 		if (error == 0) {
 			error = zfs_secpolicy_write_perms(fsname,
 			    ZFS_DELEG_PERM_RELEASE, cr);
 		}
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Policy for allowing temporary snapshots to be taken or released.
  *
  * A temporary snapshot amounts to snapshot + hold + release + destroy in one
  * step.  ZFS_DELEG_PERM_DIFF on zc->zc_name is accepted on its own; failing
  * that, the snapshot, hold, release and destroy policies must all pass, and
  * the first one that fails supplies the returned error.
  */
 static int
 zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	if (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF,
 	    cr) == 0)
 		return (0);
 
 	if ((error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr)) != 0)
 		return (error);
 	if ((error = zfs_secpolicy_hold(zc, innvl, cr)) != 0)
 		return (error);
 	if ((error = zfs_secpolicy_release(zc, innvl, cr)) != 0)
 		return (error);
 
 	return (zfs_secpolicy_destroy(zc, innvl, cr));
 }
 
 /*
  * Copies in and unpacks the nvlist that the user described in the zfs_cmd_t.
  *
  * nvl		user address of the packed nvlist
  * size		length of the packed nvlist in bytes; 0 yields EINVAL
  * iflag	flags handed to ddi_copyin()
  * nvp		receives the unpacked list on success only
  *
  * Returns 0 on success, otherwise the error from ddi_copyin() or
  * nvlist_unpack().  The temporary packed buffer is always freed here.
  */
 static int
 get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
 {
 	nvlist_t *list = NULL;
 	char *packed;
 	int error;
 
 	if (size == 0)
 		return (SET_ERROR(EINVAL));
 
 	packed = kmem_alloc(size, KM_SLEEP);
 
 	error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag);
 	if (error == 0)
 		error = nvlist_unpack(packed, size, &list, 0);
 
 	kmem_free(packed, size);
 
 	if (error != 0)
 		return (error);
 
 	*nvp = list;
 	return (0);
 }
 
 /*
  * Shrink 'errors' until its serialized size fits in 'max' bytes.
  *
  * Nothing is done when the list already fits.  Otherwise a placeholder
  * ZPROP_N_MORE_ERRORS int32 entry is appended and the pair just before it is
  * removed repeatedly until the list fits; the placeholder is then replaced by
  * one holding the number of pairs that were dropped.
  *
  * Returns 0 on success, or ENOMEM when the list does not fit and 'max' is
  * below 1024 bytes.
  */
 static int
 nvlist_smush(nvlist_t *errors, size_t max)
 {
 	nvpair_t *more_errors;
 	int n = 0;
 
 	if (fnvlist_size(errors) <= max)
 		return (0);
 
 	if (max < 1024)
 		return (SET_ERROR(ENOMEM));
 
 	/* The placeholder counts toward the size while we trim. */
 	fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
 	more_errors = nvlist_prev_nvpair(errors, NULL);
 
 	do {
 		nvpair_t *victim = nvlist_prev_nvpair(errors, more_errors);
 
 		fnvlist_remove_nvpair(errors, victim);
 		n++;
 	} while (fnvlist_size(errors) > max);
 
 	/* Swap the placeholder for the real count. */
 	fnvlist_remove_nvpair(errors, more_errors);
 	fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
 	ASSERT3U(fnvlist_size(errors), <=, max);
 
 	return (0);
 }
 
 /*
  * Pack 'nvl' and copy it out to the user buffer at zc->zc_nvlist_dst.
  *
  * When the list fits in zc->zc_nvlist_dst_size it is packed and copied out
  * (EFAULT if the copyout fails).  When it does not fit, nothing is copied and
  * 0 is returned.  In every case zc_nvlist_dst_size is set to the size value
  * held at the end (as updated by fnvlist_pack() when packing happened) and
  * zc_nvlist_dst_filled is set to B_TRUE.
  */
 static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	size_t size = fnvlist_size(nvl);
 	int error = 0;
 
 	if (size <= zc->zc_nvlist_dst_size) {
 		char *packed = fnvlist_pack(nvl, &size);
 
 		if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    size, zc->zc_iflags) != 0)
 			error = SET_ERROR(EFAULT);
 		fnvlist_pack_free(packed, size);
 	}
 	/*
 	 * Otherwise the buffer is too small.  Solaris returns ENOMEM here,
 	 * because even if an error is returned from an ioctl(2), the new
 	 * zc_nvlist_dst_size will be passed to the userland.  This is not
 	 * the case for FreeBSD.  We need to return 0, so the kernel will
 	 * copy the zc_nvlist_dst_size back and the userland can discover
 	 * that a bigger buffer is needed.
 	 */
 
 	zc->zc_nvlist_dst_size = size;
 	zc->zc_nvlist_dst_filled = B_TRUE;
 	return (error);
 }
 
 /*
  * Look up the zfsvfs_t attached to the objset named 'dsname'.
  *
  * The objset is held only for the duration of the lookup.  It must be of type
  * DMU_OST_ZFS (EINVAL otherwise).  Under os_user_ptr_lock the user pointer is
  * fetched into *zfvp; if it is set, a VFS_HOLD is taken on its z_vfs,
  * otherwise ESRCH is returned.
  */
 static int
 getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	int error;
 
 	if ((error = dmu_objset_hold(dsname, FTAG, &os)) != 0)
 		return (error);
 
 	if (dmu_objset_type(os) != DMU_OST_ZFS) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	mutex_enter(&os->os_user_ptr_lock);
 	*zfvp = dmu_objset_get_user(os);
 	if (*zfvp == NULL) {
 		error = SET_ERROR(ESRCH);
 	} else {
 		VFS_HOLD((*zfvp)->z_vfs);
 	}
 	mutex_exit(&os->os_user_ptr_lock);
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
  * case its z_vfs will be NULL, and it will be opened as the owner.
  *
  * On success the z_teardown_lock is held with 'tag': as RW_WRITER when
  * 'writer' is set (which, per the original note, prevents all vnode ops from
  * running), as RW_READER otherwise.  If the filesystem turns out to be
  * unmounted the lock is dropped again and EBUSY is returned.
  *
  * NOTE(review): the EBUSY path releases the teardown lock but not the
  * reference/ownership obtained above -- confirm whether that is intended.
  */
 static int
 zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
 {
 	int error = 0;
 
 	/* Fall back to a private zfsvfs when no mounted one is found. */
 	if (getzfsvfs(name, zfvp) != 0)
 		error = zfsvfs_create(name, zfvp);
 	if (error != 0)
 		return (error);
 
 	rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
 	    RW_READER, tag);
 	if ((*zfvp)->z_unmounted) {
 		/*
 		 * XXX we could probably try again, since the unmounting
 		 * thread should be just about to disassociate the
 		 * objset from the zfsvfs.
 		 */
 		rrm_exit(&(*zfvp)->z_teardown_lock, tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	return (0);
 }
 
 /*
  * Undo zfsvfs_hold(): drop the z_teardown_lock taken with 'tag', then either
  * release the VFS hold (when z_vfs is set) or, for a zfsvfs that has no
  * z_vfs, disown its objset and free it.
  */
 static void
 zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
 {
 	rrm_exit(&zfsvfs->z_teardown_lock, tag);
 
 	if (zfsvfs->z_vfs == NULL) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 		return;
 	}
 
 	VFS_RELE(zfsvfs->z_vfs);
 }
 
 /*
  * Create the pool named zc->zc_name.
  *
  * The vdev configuration comes from zc_nvlist_conf and optional pool
  * properties from zc_nvlist_src.  When properties are supplied, the requested
  * version must be supported (EINVAL otherwise), any ZPOOL_ROOTFS_PROPS are
  * split off into 'rootprops', and 'zplprops' is filled in by
  * zfs_fill_zplprops_root().  After spa_create() succeeds the remaining root
  * properties are applied; if that fails the new pool is destroyed again.
  */
 static int
 zfs_ioc_pool_create(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	nvlist_t *props = NULL;
 	nvlist_t *rootprops = NULL;
 	nvlist_t *zplprops = NULL;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &props);
 		if (error != 0) {
 			nvlist_free(config);
 			return (error);
 		}
 	}
 
 	if (props != NULL) {
 		nvlist_t *rootfs = NULL;
 		uint64_t version = SPA_VERSION;
 
 		/* 'version' keeps its default if the property is absent. */
 		(void) nvlist_lookup_uint64(props,
 		    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
 		if (!SPA_VERSION_IS_SUPPORTED(version)) {
 			error = SET_ERROR(EINVAL);
 			goto pool_props_bad;
 		}
 
 		(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS,
 		    &rootfs);
 		if (rootfs != NULL) {
 			error = nvlist_dup(rootfs, &rootprops, KM_SLEEP);
 			if (error != 0) {
 				nvlist_free(config);
 				nvlist_free(props);
 				return (error);
 			}
 			(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
 		}
 
 		VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		error = zfs_fill_zplprops_root(version, rootprops,
 		    zplprops, NULL);
 		if (error != 0)
 			goto pool_props_bad;
 	}
 
 	error = spa_create(zc->zc_name, config, props, zplprops);
 
 	/*
 	 * Set the remaining root properties
 	 */
 	if (error == 0) {
 		error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL,
 		    rootprops, NULL);
 		if (error != 0)
 			(void) spa_destroy(zc->zc_name);
 	}
 
 pool_props_bad:
 	nvlist_free(rootprops);
 	nvlist_free(zplprops);
 	nvlist_free(config);
 	nvlist_free(props);
 
 	return (error);
 }
 
 /*
  * Destroy the pool named zc->zc_name.  The request is logged to the history
  * first; on successful destruction the pool's zvol minors are removed too.
  */
 static int
 zfs_ioc_pool_destroy(zfs_cmd_t *zc)
 {
 	int error;
 
 	zfs_log_history(zc);
 
 	if ((error = spa_destroy(zc->zc_name)) != 0)
 		return (error);
 
 	zvol_remove_minors(zc->zc_name);
 	return (0);
 }
 
 /*
  * Import the pool described by zc_nvlist_conf under the name zc->zc_name.
  *
  * Optional properties come from zc_nvlist_src.  The config must carry a
  * ZPOOL_CONFIG_POOL_GUID equal to zc->zc_guid (EINVAL otherwise); zc_cookie
  * is passed to spa_import() as its last argument.  If the caller supplied a
  * destination buffer the config is copied back out, and a failure of that
  * copy overrides the import result.
  */
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	nvlist_t *props = NULL;
 	uint64_t guid;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &props);
 		if (error != 0) {
 			nvlist_free(config);
 			return (error);
 		}
 	}
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
 	    guid == zc->zc_guid)
 		error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
 	else
 		error = SET_ERROR(EINVAL);
 
 	if (zc->zc_nvlist_dst != 0) {
 		int err = put_nvlist(zc, config);
 
 		if (err != 0)
 			error = err;
 	}
 
 	nvlist_free(config);
 
 	if (props != NULL)
 		nvlist_free(props);
 
 	return (error);
 }
 
 /*
  * Export the pool named zc->zc_name.  zc_cookie supplies the 'force' flag and
  * zc_guid the 'hardforce' flag for spa_export().  The request is logged to
  * the history first; after a successful export the zvol minors are removed.
  */
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
 	boolean_t force = (boolean_t)zc->zc_cookie;
 	boolean_t hardforce = (boolean_t)zc->zc_guid;
 	int error;
 
 	zfs_log_history(zc);
 
 	if ((error = spa_export(zc->zc_name, NULL, force, hardforce)) != 0)
 		return (error);
 
 	zvol_remove_minors(zc->zc_name);
 	return (0);
 }
 
 /*
  * Copy out the nvlist returned by spa_all_configs(), which is also handed
  * &zc->zc_cookie.  A NULL result is reported as EEXIST; otherwise the list
  * is sent to the caller with put_nvlist() and freed.
  */
 static int
 zfs_ioc_pool_configs(zfs_cmd_t *zc)
 {
 	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
 	int error;
 
 	if (configs == NULL)
 		return (SET_ERROR(EEXIST));
 
 	error = put_nvlist(zc, configs);
 	nvlist_free(configs);
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of the pool
  *
  * outputs:
  * zc_cookie		real errno
  * zc_nvlist_dst	config nvlist
  * zc_nvlist_dst_size	size of config nvlist
  *
  * zc_value is also passed to spa_get_stats() as a buffer.  When no config is
  * produced, the spa_get_stats() error is the return value and zc_cookie is
  * left untouched.
  */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	int error;
 	int ret;
 
 	error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
 	    sizeof (zc->zc_value));
 
 	if (config == NULL)
 		return (error);
 
 	ret = put_nvlist(zc, config);
 	nvlist_free(config);
 
 	/*
 	 * The config may be present even if 'error' is non-zero.
 	 * In this case we return the put_nvlist() result, and preserve
 	 * the real errno in 'zc_cookie'.
 	 */
 	zc->zc_cookie = error;
 
 	return (ret);
 }
 
 /*
  * Try to import the given pool, returning pool stats as appropriate so that
  * user land knows which devices are available and overall pool health.
  *
  * The candidate config is read from zc_nvlist_conf and handed to
  * spa_tryimport(); a NULL result is reported as EINVAL, otherwise the
  * resulting config is copied out with put_nvlist().
  */
 static int
 zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 {
 	nvlist_t *tryconfig;
 	nvlist_t *config;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &tryconfig);
 	if (error != 0)
 		return (error);
 
 	config = spa_tryimport(tryconfig);
 	nvlist_free(tryconfig);
 
 	if (config != NULL) {
 		error = put_nvlist(zc, config);
 		nvlist_free(config);
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name              name of the pool
  * zc_cookie            scan func (pool_scan_func_t)
  *
  * POOL_SCAN_NONE stops the current scan via spa_scan_stop(); any other value
  * is passed to spa_scan().
  */
 static int
 zfs_ioc_pool_scan(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = (zc->zc_cookie == POOL_SCAN_NONE) ?
 	    spa_scan_stop(spa) : spa_scan(spa, zc->zc_cookie);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Open the pool named zc->zc_name and call spa_freeze() on it.  Returns the
  * spa_open() error, or 0 once the pool has been frozen and closed again.
  */
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	spa_freeze(spa);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
 
 /*
  * Upgrade the pool named zc->zc_name to the version in zc->zc_cookie.  The
  * requested version must not be lower than the pool's current version and
  * must be a supported one; otherwise EINVAL is returned and nothing changes.
  */
 static int
 zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_cookie >= spa_version(spa) &&
 	    SPA_VERSION_IS_SUPPORTED(zc->zc_cookie))
 		spa_upgrade(spa, zc->zc_cookie);
 	else
 		error = SET_ERROR(EINVAL);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Read pool history for zc->zc_name into the user buffer at zc->zc_history.
  *
  * zc_history_len gives the buffer size on entry (0 yields EINVAL) and, like
  * zc_history_offset, is passed by reference to spa_history_get(), which may
  * update both.  Pools older than SPA_VERSION_ZPOOL_HISTORY yield ENOTSUP.
  * The number of bytes copied out is the updated zc_history_len.
  */
 static int
 zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 {
 	uint64_t size = zc->zc_history_len;
 	char *hist_buf;
 	spa_t *spa;
 	int error;
 
 	if (size == 0)
 		return (SET_ERROR(EINVAL));
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	/* Sized from the entry value; zc_history_len may change below. */
 	hist_buf = kmem_alloc(size, KM_SLEEP);
 
 	error = spa_history_get(spa, &zc->zc_history_offset,
 	    &zc->zc_history_len, hist_buf);
 	if (error == 0) {
 		error = ddi_copyout(hist_buf,
 		    (void *)(uintptr_t)zc->zc_history,
 		    zc->zc_history_len, zc->zc_iflags);
 	}
 
 	spa_close(spa, FTAG);
 	kmem_free(hist_buf, size);
 	return (error);
 }
 
 /*
  * Open the pool named zc->zc_name and call spa_change_guid() on it.  Returns
  * the spa_open() error if the pool cannot be opened, otherwise the result of
  * spa_change_guid().
  */
 static int
 zfs_ioc_pool_reguid(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_change_guid(spa);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Thin wrapper around dsl_dsobj_to_dsname(): passes zc_name, zc_obj and the
  * zc_value buffer through and returns its result unchanged.
  */
 static int
 zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_obj		object to find
  *
  * outputs:
  * zc_value		name of object
  *
  * The objset must be of type DMU_OST_ZFS; anything else yields EINVAL.
  */
 static int
 zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	/* XXX reading from objset not owned */
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
 		    sizeof (zc->zc_value));
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_obj		object to find
  *
  * outputs:
  * zc_stat		stats on object
  * zc_value		path to object
  *
  * The objset must be of type DMU_OST_ZFS; anything else yields EINVAL.
  */
 static int
 zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	/* XXX reading from objset not owned */
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat,
 		    zc->zc_value, sizeof (zc->zc_value));
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Add the vdevs described by zc_nvlist_conf to the pool named zc->zc_name.
  *
  * Returns the spa_open() or get_nvlist() error if either fails, otherwise
  * the result of spa_vdev_add() (or EDOM from the illumos-only root pool
  * check).  The pool reference and the unpacked config are always released
  * before returning.
  */
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 	nvlist_t *config, **l2cache, **spares;
 	uint_t nl2cache = 0, nspares = 0;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0) {
 		/*
 		 * get_nvlist() leaves 'config' unset on failure, so it must
 		 * not be inspected or freed below.
 		 */
 		spa_close(spa, FTAG);
 		return (error);
 	}
 
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache);
 
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares);
 
 #ifdef illumos
 	/*
 	 * A root pool with concatenated devices is not supported.
 	 * Thus, can not add a device to a root pool.
 	 *
 	 * Intent log device can not be added to a rootpool because
 	 * during mountroot, zil is replayed, a separated log device
 	 * can not be accessed during the mountroot time.
 	 *
 	 * l2cache and spare devices are ok to be added to a rootpool.
 	 */
 	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
 		nvlist_free(config);
 		spa_close(spa, FTAG);
 		return (SET_ERROR(EDOM));
 	}
 #endif /* illumos */
 
 	error = spa_vdev_add(spa, config);
 	nvlist_free(config);
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of the pool
  * zc_guid		guid handed to spa_vdev_remove()
  *
  * NOTE(review): the previous comment listed zc_nvlist_conf and zc_cookie as
  * inputs, but this function reads neither of them.
  */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Change the state of the vdev zc->zc_guid in the pool zc->zc_name.
  *
  * zc_cookie selects the requested state (ONLINE, OFFLINE, FAULTED or
  * DEGRADED; anything else yields EINVAL) and zc_obj carries the extra
  * argument for the matching vdev_*() call.  For FAULTED and DEGRADED, a
  * zc_obj other than VDEV_AUX_ERR_EXCEEDED or VDEV_AUX_EXTERNAL is rewritten
  * to VDEV_AUX_ERR_EXCEEDED.  On return zc_cookie holds 'newstate', which only
  * vdev_online() can change from VDEV_STATE_UNKNOWN.
  */
 static int
 zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
 {
 	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	switch (zc->zc_cookie) {
 	case VDEV_STATE_ONLINE:
 		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
 		break;
 
 	case VDEV_STATE_OFFLINE:
 		error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
 		break;
 
 	case VDEV_STATE_FAULTED:
 	case VDEV_STATE_DEGRADED:
 		/* Both states accept the same two aux codes. */
 		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
 		    zc->zc_obj != VDEV_AUX_EXTERNAL)
 			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
 
 		if (zc->zc_cookie == VDEV_STATE_FAULTED)
 			error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
 		else
 			error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
 		break;
 
 	default:
 		error = SET_ERROR(EINVAL);
 	}
 
 	zc->zc_cookie = newstate;
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Attach the vdev described by zc_nvlist_conf to the vdev zc->zc_guid in the
  * pool zc->zc_name.  zc_cookie is forwarded to spa_vdev_attach() as its
  * 'replacing' argument.
  */
 static int
 zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 {
 	int replacing = zc->zc_cookie;
 	nvlist_t *config;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error == 0) {
 		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
 		nvlist_free(config);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Detach the vdev zc->zc_guid from the pool zc->zc_name by calling
  * spa_vdev_detach() with arguments 0 and B_FALSE.
  */
 static int
 zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Split the pool zc->zc_name via spa_vdev_split_mirror().
  *
  * zc_string is passed as the second argument, zc_nvlist_conf supplies the
  * config and zc_nvlist_src optional properties.  The
  * ZPOOL_EXPORT_AFTER_SPLIT bit of zc_cookie becomes the final 'exp' flag.
  */
 static int
 zfs_ioc_vdev_split(zfs_cmd_t *zc)
 {
 	boolean_t exp = ((zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT) != 0);
 	nvlist_t *config;
 	nvlist_t *props = NULL;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0) {
 		spa_close(spa, FTAG);
 		return (error);
 	}
 
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &props);
 		if (error != 0) {
 			spa_close(spa, FTAG);
 			nvlist_free(config);
 			return (error);
 		}
 	}
 
 	error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
 
 	spa_close(spa, FTAG);
 
 	nvlist_free(config);
 	nvlist_free(props);
 
 	return (error);
 }
 
 /*
  * Call spa_vdev_setpath() for the vdev zc->zc_guid in the pool zc->zc_name,
  * passing zc->zc_value as the new path.
  */
 static int
 zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Call spa_vdev_setfru() for the vdev zc->zc_guid in the pool zc->zc_name,
  * passing zc->zc_value as the new FRU string.
  */
 static int
 zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_setfru(spa, zc->zc_guid, zc->zc_value);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Fill in zc->zc_objset_stats for 'os' and, when the caller supplied a
  * destination buffer (zc_nvlist_dst != 0), copy out the property nvlist.
  *
  * The property list is built from dsl_prop_get_all() plus
  * dmu_objset_stats(), and for consistent ZVOL objsets also
  * zvol_get_stats().  Returns 0 on success, the dsl_prop_get_all() error, EIO
  * from zvol_get_stats(), or the put_nvlist() result.  The property list is
  * freed on every path, including the EIO early exit.
  */
 static int
 zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 {
 	int error = 0;
 	nvlist_t *nv;
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	if (zc->zc_nvlist_dst != 0 &&
 	    (error = dsl_prop_get_all(os, &nv)) == 0) {
 		dmu_objset_stats(os, nv);
 		/*
 		 * NB: zvol_get_stats() will read the objset contents,
 		 * which we aren't supposed to do with a
 		 * DS_MODE_USER hold, because it could be
 		 * inconsistent.  So this is a bit of a workaround...
 		 * XXX reading with out owning
 		 */
 		if (!zc->zc_objset_stats.dds_inconsistent &&
 		    dmu_objset_type(os) == DMU_OST_ZVOL) {
 			error = zvol_get_stats(os, nv);
 			if (error == EIO) {
 				/* Don't leak the property list on EIO. */
 				nvlist_free(nv);
 				return (error);
 			}
 			VERIFY0(error);
 		}
 		error = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	}
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * An ENOMEM result is converted to success before returning.
  */
 static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) == 0) {
 		error = zfs_ioc_objset_stats_impl(zc, os);
 		dmu_objset_rele(os, FTAG);
 	}
 
 	return (error == ENOMEM ? 0 : error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_nvlist_dst	received property nvlist
  * zc_nvlist_dst_size	size of received property nvlist
  *
  * Gets received properties (distinct from local properties on or after
  * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
  * local property values.  Returns ENOTSUP when dsl_prop_get_hasrecvd() says
  * the dataset has none, and 0 without copying anything when the caller gave
  * no destination buffer.
  */
 static int
 zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
 {
 	nvlist_t *nv;
 	int error;
 
 	/*
 	 * Without this check, we would return local property values if the
 	 * caller has not already received properties on or after
 	 * SPA_VERSION_RECVD_PROPS.
 	 */
 	if (!dsl_prop_get_hasrecvd(zc->zc_name))
 		return (SET_ERROR(ENOTSUP));
 
 	if (zc->zc_nvlist_dst == 0)
 		return (0);
 
 	error = dsl_prop_get_received(zc->zc_name, &nv);
 	if (error != 0)
 		return (error);
 
 	error = put_nvlist(zc, nv);
 	nvlist_free(nv);
 
 	return (error);
 }
 
 /*
  * Look up the ZPL property 'prop' on 'os' and add it to 'props' as a uint64
  * keyed by the property's name.  Returns the zfs_get_zplprop() error if the
  * lookup fails, otherwise 0 (the add itself is VERIFY'd).
  */
 static int
 nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
 {
 	uint64_t value;
 	int error;
 
 	/*
 	 * zfs_get_zplprop() will either find a value or give us
 	 * the default value (if there is one).
 	 */
 	error = zfs_get_zplprop(os, prop, &value);
 	if (error == 0) {
 		VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop),
 		    value) == 0);
 	}
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for zpl property nvlist
  *
  * outputs:
  * zc_nvlist_dst	zpl property nvlist
  * zc_nvlist_dst_size	size of zpl property nvlist
  *
  * zc_objset_stats is filled in as a side effect.  ENOENT is returned when no
  * destination buffer was supplied, when the dataset is marked inconsistent,
  * or when the objset is not of type DMU_OST_ZFS.
  */
 static int
 zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	nvlist_t *nv;
 	int err;
 
 	/* XXX reading without owning */
 	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (err != 0)
 		return (err);
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	/*
 	 * NB: nvl_add_zplprop() will read the objset contents,
 	 * which we aren't supposed to do with a DS_MODE_USER
 	 * hold, because it could be inconsistent.
 	 */
 	if (zc->zc_nvlist_dst == 0 ||
 	    zc->zc_objset_stats.dds_inconsistent ||
 	    dmu_objset_type(os) != DMU_OST_ZFS) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* Stop adding at the first failure; copy out only if all succeed. */
 	err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION);
 	if (err == 0)
 		err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE);
 	if (err == 0)
 		err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY);
 	if (err == 0)
 		err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE);
 	if (err == 0)
 		err = put_nvlist(zc, nv);
 
 	nvlist_free(nv);
 	dmu_objset_rele(os, FTAG);
 	return (err);
 }
 
 /*
  * Decide whether the dataset 'name' should be skipped when listing.
  *
  * Returns B_TRUE for internal datasets (which have a $ in their name), for
  * temporary datasets (which have a % in their name), and for datasets that
  * are not visible in the current non-global zone; B_FALSE otherwise.
  */
 boolean_t
 dataset_name_hidden(const char *name)
 {
 	if (strchr(name, '$') != NULL || strchr(name, '%') != NULL)
 		return (B_TRUE);
 
 	if (INGLOBALZONE(curthread))
 		return (B_FALSE);
 
 	if (zone_dataset_visible(name, NULL))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_name		name of next filesystem
  * zc_cookie		zap cursor
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * ENOENT from the hold or from the directory walk is reported as ESRCH.
  * Hidden names (see dataset_name_hidden()) are stepped over.  If fetching
  * stats for the chosen child fails with ENOENT, zc_name is cut back to its
  * original length and the search resumes from the current cursor.
  */
 static int
 zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 	char *tail;
 	size_t orig_len = strlen(zc->zc_name);
 
 	for (;;) {
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 		if (error != 0) {
 			if (error == ENOENT)
 				error = SET_ERROR(ESRCH);
 			return (error);
 		}
 
 		/* Child names are written right after a trailing '/'. */
 		tail = strrchr(zc->zc_name, '/');
 		if (tail == NULL || tail[1] != '\0')
 			(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 		tail = zc->zc_name + strlen(zc->zc_name);
 
 		do {
 			error = dmu_dir_list_next(os,
 			    sizeof (zc->zc_name) - (tail - zc->zc_name), tail,
 			    NULL, &zc->zc_cookie);
 			if (error == ENOENT)
 				error = SET_ERROR(ESRCH);
 		} while (error == 0 && dataset_name_hidden(zc->zc_name));
 		dmu_objset_rele(os, FTAG);
 
 		/*
 		 * If it's an internal dataset (ie. with a '$' in its name),
 		 * don't try to get stats for it, otherwise we'll return
 		 * ENOENT.
 		 */
 		if (error != 0 || strchr(zc->zc_name, '$') != NULL)
 			return (error);
 
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 		if (error != ENOENT)
 			return (error);
 
 		/* We lost a race with destroy, get the next one. */
 		zc->zc_name[orig_len] = '\0';
 	}
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_nvlist_dst_size	size of buffer for property nvlist
  * zc_simple		when set, only name is requested
  *
  * outputs:
  * zc_name		name of next snapshot
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * zc_obj and zc_cookie are also updated by dmu_snapshot_list_next().  ENOENT
  * from the hold or from the walk is reported as ESRCH.  On any failure after
  * the '@' has been appended, zc_name is truncated at its first '@'.
  */
 static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	size_t namelen;
 	int error;
 
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error == ENOENT)
 		return (ESRCH);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
 	 */
 	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(ESRCH));
 	}
 
 	/* The snapshot component is written right after the '@'. */
 	namelen = strlen(zc->zc_name);
 	error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - namelen,
 	    zc->zc_name + namelen, &zc->zc_obj, &zc->zc_cookie, NULL);
 
 	if (error == ENOENT) {
 		error = SET_ERROR(ESRCH);
 	} else if (error == 0 && !zc->zc_simple) {
 		dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
 		dsl_dataset_t *ds;
 
 		error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
 		if (error == 0) {
 			objset_t *ossnap;
 
 			error = dmu_objset_from_ds(ds, &ossnap);
 			if (error == 0)
 				error = zfs_ioc_objset_stats_impl(zc, ossnap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 	}
 
 	dmu_objset_rele(os, FTAG);
 
 	/* if we failed, undo the @ that we tacked on to zc_name */
 	if (error != 0)
 		*strchr(zc->zc_name, '@') = '\0';
 
 	return (error);
 }
 
 /*
  * Apply a userquota-style property carried in 'pair' to the dataset 'dsname'.
  *
  * If the pair is an nvlist, the real value is the ZPROP_VALUE pair inside it
  * (EINVAL if missing).  The property name must contain a '-' and the value
  * must be a uint64 array of exactly three elements (EINVAL otherwise), read
  * as { type, rid, quota }.  The text after the first '-' in the name is used
  * as the domain.  The quota is set through zfs_set_userquota() on a zfsvfs
  * held with zfsvfs_hold() in non-writer mode.
  */
 static int
 zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
 {
 	const char *propname = nvpair_name(pair);
 	uint64_t *valary;
 	unsigned int vallen;
 	char *dash;
 	zfsvfs_t *zfsvfs;
 	int err;
 
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		nvlist_t *attrs;
 
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A correctly constructed propname is encoded as
 	 * userquota@<rid>-<domain>.
 	 */
 	dash = strchr(propname, '-');
 	if (dash == NULL)
 		return (SET_ERROR(EINVAL));
 	if (nvpair_value_uint64_array(pair, &valary, &vallen) != 0)
 		return (SET_ERROR(EINVAL));
 	if (vallen != 3)
 		return (SET_ERROR(EINVAL));
 
 	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
 	if (err != 0)
 		return (err);
 
 	{
 		const char *domain = dash + 1;
 		zfs_userquota_prop_t type = valary[0];
 		uint64_t rid = valary[1];
 		uint64_t quota = valary[2];
 
 		err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
 	}
 	zfsvfs_rele(zfsvfs, FTAG);
 
 	return (err);
 }
 
 /*
  * If the named property is one that has a special function to set its value,
  * return 0 on success and a positive error code on failure; otherwise if it is
  * not one of the special properties handled by this function, return -1.
  *
  * dsname	dataset the property applies to
  * source	property source passed through to the dsl setters
  * pair		the property; if it is an nvlist, its ZPROP_VALUE pair holds
  *		the actual value
  *
  * Names that do not map to a known property are handed to
  * zfs_prop_set_userquota() when they are userquota names and otherwise
  * yield -1.  String-typed properties always yield -1.  The filesystem and
  * snapshot limit properties also yield -1 after successful activation so
  * that the caller still stores the value through the generic path.
  *
  * XXX: It would be better for callers of the property interface if we handled
  * these special cases in dsl_prop.c (in the dsl layer).
  */
 static int
 zfs_prop_set_special(const char *dsname, zprop_source_t source,
     nvpair_t *pair)
 {
 	const char *propname = nvpair_name(pair);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
 	int err = -1;
 
 	if (prop == ZPROP_INVAL) {
 		if (zfs_prop_userquota(propname))
 			return (zfs_prop_set_userquota(dsname, pair));
 		return (-1);
 	}
 
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 		    &pair) == 0);
 	}
 
 	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
 		return (-1);
 
 	VERIFY(0 == nvpair_value_uint64(pair, &intval));
 
 	switch (prop) {
 	case ZFS_PROP_QUOTA:
 		err = dsl_dir_set_quota(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFQUOTA:
 		err = dsl_dataset_set_refquota(dsname, source, intval);
 		break;
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 		if (intval == UINT64_MAX) {
 			/* clearing the limit, just do it */
 			err = 0;
 		} else {
 			err = dsl_dir_activate_fs_ss_limit(dsname);
 		}
 		/*
 		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
 		 * default path to set the value in the nvlist.
 		 */
 		if (err == 0)
 			err = -1;
 		break;
 	case ZFS_PROP_RESERVATION:
 		err = dsl_dir_set_reservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFRESERVATION:
 		err = dsl_dataset_set_refreservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_VOLSIZE:
 		err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
 		    intval);
 		break;
 	case ZFS_PROP_VERSION:
 	{
 		zfsvfs_t *zfsvfs;
 
 		/* Writer mode: see zfsvfs_hold() for what that implies. */
 		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
 			break;
 
 		err = zfs_set_version(zfsvfs, intval);
 		zfsvfs_rele(zfsvfs, FTAG);
 
 		if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
 			zfs_cmd_t *zc;
 
 			zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 			/*
 			 * Bounded copy: never write past the end of the
 			 * fixed-size zc_name buffer.
 			 */
 			(void) strlcpy(zc->zc_name, dsname,
 			    sizeof (zc->zc_name));
 			/* Best effort; the upgrade result is ignored. */
 			(void) zfs_ioc_userspace_upgrade(zc);
 			kmem_free(zc, sizeof (zfs_cmd_t));
 		}
 		break;
 	}
 	default:
 		err = -1;
 	}
 
 	return (err);
 }
 
 /*
  * This function is best effort. If it fails to set any of the given properties,
  * it continues to set as many as it can and returns the last error
  * encountered. If the caller provides a non-NULL errlist, it will be filled in
  * with the list of names of all the properties that failed along with the
  * corresponding error numbers.
  *
  * If every property is set successfully, zero is returned and errlist is not
  * modified.
  */
 int
 zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
     nvlist_t *errlist)
 {
 	nvpair_t *pair;
 	nvpair_t *propval;
 	int rv = 0;
 	uint64_t intval;
 	char *strval;
 	/* Pairs with no special handler; applied together after the loop. */
 	nvlist_t *genericnvl = fnvlist_alloc();
 	/* Pairs whose special handler failed on the first pass. */
 	nvlist_t *retrynvl = fnvlist_alloc();
 
 retry:
 	/*
 	 * This loop runs over the caller's list and then, if anything was
 	 * queued for retry, once more over retrynvl.
 	 */
 	pair = NULL;
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
 		zfs_prop_t prop = zfs_name_to_prop(propname);
 		int err = 0;
 
 		/* decode the property value */
 		propval = pair;
 		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 			/* the real value is nested under ZPROP_VALUE */
 			nvlist_t *attrs;
 			attrs = fnvpair_value_nvlist(pair);
 			if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 			    &propval) != 0)
 				err = SET_ERROR(EINVAL);
 		}
 
 		/* Validate value type */
 		if (err == 0 && prop == ZPROP_INVAL) {
 			/*
 			 * Not a native property: only user properties
 			 * (string) and userquota-style properties (uint64
 			 * array) are accepted.
 			 */
 			if (zfs_prop_user(propname)) {
 				if (nvpair_type(propval) != DATA_TYPE_STRING)
 					err = SET_ERROR(EINVAL);
 			} else if (zfs_prop_userquota(propname)) {
 				if (nvpair_type(propval) !=
 				    DATA_TYPE_UINT64_ARRAY)
 					err = SET_ERROR(EINVAL);
 			} else {
 				err = SET_ERROR(EINVAL);
 			}
 		} else if (err == 0) {
 			/*
 			 * Native property: the nvpair type must agree with
 			 * the property's declared type.
 			 */
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
 				if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
 					err = SET_ERROR(EINVAL);
 			} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
 				const char *unused;
 
 				intval = fnvpair_value_uint64(propval);
 
 				switch (zfs_prop_get_type(prop)) {
 				case PROP_TYPE_NUMBER:
 					break;
 				case PROP_TYPE_STRING:
 					err = SET_ERROR(EINVAL);
 					break;
 				case PROP_TYPE_INDEX:
 					/* the index must name a known value */
 					if (zfs_prop_index_to_string(prop,
 					    intval, &unused) != 0)
 						err = SET_ERROR(EINVAL);
 					break;
 				default:
 					cmn_err(CE_PANIC,
 					    "unknown property type");
 				}
 			} else {
 				err = SET_ERROR(EINVAL);
 			}
 		}
 
 		/* Validate permissions */
 		if (err == 0)
 			err = zfs_check_settable(dsname, pair, CRED());
 
 		if (err == 0) {
 			/* -1 means "no special handling for this property" */
 			err = zfs_prop_set_special(dsname, source, pair);
 			if (err == -1) {
 				/*
 				 * For better performance we build up a list of
 				 * properties to set in a single transaction.
 				 */
 				err = nvlist_add_nvpair(genericnvl, pair);
 			} else if (err != 0 && nvl != retrynvl) {
 				/*
 				 * This may be a spurious error caused by
 				 * receiving quota and reservation out of order.
 				 * Try again in a second pass.
 				 */
 				err = nvlist_add_nvpair(retrynvl, pair);
 			}
 		}
 
 		/*
 		 * On the retry pass nvl == retrynvl, so a failing special
 		 * property is not queued again and is recorded here.
 		 */
 		if (err != 0) {
 			if (errlist != NULL)
 				fnvlist_add_int32(errlist, propname, err);
 			rv = err;
 		}
 	}
 
 	/* Switch to the retry list exactly once, if it has any entries. */
 	if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
 		nvl = retrynvl;
 		goto retry;
 	}
 
 	if (!nvlist_empty(genericnvl) &&
 	    dsl_props_set(dsname, source, genericnvl) != 0) {
 		/*
 		 * If this fails, we still want to set as many properties as we
 		 * can, so try setting them individually.
 		 */
 		pair = NULL;
 		while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
 			const char *propname = nvpair_name(pair);
 			int err = 0;
 
 			propval = pair;
 			if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 				nvlist_t *attrs;
 				attrs = fnvpair_value_nvlist(pair);
 				propval = fnvlist_lookup_nvpair(attrs,
 				    ZPROP_VALUE);
 			}
 
 			/* anything that is not a string is read as a uint64 */
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
 				strval = fnvpair_value_string(propval);
 				err = dsl_prop_set_string(dsname, propname,
 				    source, strval);
 			} else {
 				intval = fnvpair_value_uint64(propval);
 				err = dsl_prop_set_int(dsname, propname, source,
 				    intval);
 			}
 
 			if (err != 0) {
 				if (errlist != NULL) {
 					fnvlist_add_int32(errlist, propname,
 					    err);
 				}
 				rv = err;
 			}
 		}
 	}
 	nvlist_free(genericnvl);
 	nvlist_free(retrynvl);
 
 	return (rv);
 }
 
 /*
  * Check that all the properties are valid user properties.
  *
  * Every pair in nvl must carry a user-property name and a string value, the
  * caller must hold the ZFS_DELEG_PERM_USERPROP write permission on fsname,
  * and both the name and the value must fit within the ZAP limits.
  *
  * Returns 0 if every pair passes (including when nvl has no pairs), otherwise
  * the error for the first offending pair: EINVAL, the permission check's
  * error, ENAMETOOLONG or E2BIG.
  */
 static int
 zfs_check_userprops(const char *fsname, nvlist_t *nvl)
 {
 	nvpair_t *pair = NULL;
 	int error;
 
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
 
 		if (!zfs_prop_user(propname) ||
 		    nvpair_type(pair) != DATA_TYPE_STRING)
 			return (SET_ERROR(EINVAL));
 
 		error = zfs_secpolicy_write_perms(fsname,
 		    ZFS_DELEG_PERM_USERPROP, CRED());
 		if (error != 0)
 			return (error);
 
 		if (strlen(propname) >= ZAP_MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 
 		/* report through SET_ERROR like the other failures here */
 		if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
 			return (SET_ERROR(E2BIG));
 	}
 	return (0);
 }
 
 /*
  * Allocate *newprops and fill it with every pair of 'props' whose name is
  * not present in 'skipped'.  Allocation and insertion failures are fatal
  * (VERIFY).
  */
 static void
 props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
 {
 	nvpair_t *elem;
 
 	VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	for (elem = nvlist_next_nvpair(props, NULL); elem != NULL;
 	    elem = nvlist_next_nvpair(props, elem)) {
 		if (!nvlist_exists(skipped, nvpair_name(elem))) {
 			VERIFY(nvlist_add_nvpair(*newprops, elem) == 0);
 		}
 	}
 }
 
 /*
  * Pass every property in 'props' that is not named in 'skipped' to
  * zfs_set_prop_nvlist() with a source of ZPROP_SRC_NONE, combined with
  * ZPROP_SRC_RECEIVED when the dataset has already received properties.
  * Returns the result of that call, or 0 when nothing is left after
  * filtering.
  */
 static int
 clear_received_props(const char *dsname, nvlist_t *props,
     nvlist_t *skipped)
 {
 	nvlist_t *cleared_props = NULL;
 	int err = 0;
 
 	props_skip(props, skipped, &cleared_props);
 	if (!nvlist_empty(cleared_props)) {
 		/*
 		 * Acts on local properties until the dataset has received
 		 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
 		 */
 		zprop_source_t flags = ZPROP_SRC_NONE;
 
 		if (dsl_prop_get_hasrecvd(dsname))
 			flags |= ZPROP_SRC_RECEIVED;
 		err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
 	}
 	nvlist_free(cleared_props);
 	return (err);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_value		name of property to set
  * zc_nvlist_src{_size}	nvlist of properties to apply
  * zc_cookie		received properties flag
  *
  * outputs:
  * zc_nvlist_dst{_size} error for each unapplied received property
  */
 static int
 zfs_ioc_set_prop(zfs_cmd_t *zc)
 {
 	boolean_t received = zc->zc_cookie;
 	zprop_source_t source;
 	nvlist_t *nvl;
 	nvlist_t *errors;
 	int error;
 
 	source = received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL;
 
 	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &nvl);
 	if (error != 0)
 		return (error);
 
 	if (received) {
 		nvlist_t *origprops;
 
 		/*
 		 * Clear the previously received properties that are not
 		 * named in the incoming list; failures here are ignored.
 		 */
 		if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
 			(void) clear_received_props(zc->zc_name,
 			    origprops, nvl);
 			nvlist_free(origprops);
 		}
 
 		error = dsl_prop_set_hasrecvd(zc->zc_name);
 	}
 
 	/* the per-property error list is returned even if nothing was set */
 	errors = fnvlist_alloc();
 	if (error == 0)
 		error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
 
 	if (zc->zc_nvlist_dst != 0 && errors != NULL)
 		(void) put_nvlist(zc, errors);
 
 	nvlist_free(errors);
 	nvlist_free(nvl);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_value		name of property to inherit
  * zc_cookie		revert to received value if TRUE
  *
  * outputs:		none
  */
 static int
 zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 {
 	const char *propname = zc->zc_value;
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	boolean_t received = zc->zc_cookie;
 	nvlist_t *dummy;
 	zprop_type_t type;
 	int err;
 
 	if (!received) {
 		/*
 		 * Only check this in the non-received case. We want to allow
 		 * 'inherit -S' to revert non-inheritable properties like quota
 		 * and reservation to the received or default values even though
 		 * they are not considered inheritable.
 		 */
 		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
 			return (SET_ERROR(EINVAL));
 
 		/* explicitly inherit */
 		return (dsl_prop_inherit(zc->zc_name, zc->zc_value,
 		    ZPROP_SRC_INHERITED));
 	}
 
 	/*
 	 * Received case: revert to the received value, if any.
 	 * zfs_prop_set_special() expects properties in the form of an
 	 * nvpair with type info, so work out the type first.
 	 */
 	if (prop == ZPROP_INVAL) {
 		if (!zfs_prop_user(propname))
 			return (SET_ERROR(EINVAL));
 
 		type = PROP_TYPE_STRING;
 	} else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) {
 		return (SET_ERROR(EINVAL));
 	} else {
 		type = zfs_prop_get_type(prop);
 	}
 
 	/* build a one-entry list holding a placeholder value of that type */
 	VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	switch (type) {
 	case PROP_TYPE_STRING:
 		VERIFY(0 == nvlist_add_string(dummy, propname, ""));
 		break;
 	case PROP_TYPE_NUMBER:
 	case PROP_TYPE_INDEX:
 		VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
 		break;
 	default:
 		nvlist_free(dummy);
 		return (SET_ERROR(EINVAL));
 	}
 
 	err = zfs_prop_set_special(zc->zc_name, ZPROP_SRC_NONE,
 	    nvlist_next_nvpair(dummy, NULL));
 	nvlist_free(dummy);
 	if (err != -1)
 		return (err); /* special property already handled */
 
 	/* property name has been validated by zfs_secpolicy_inherit_prop() */
 	return (dsl_prop_inherit(zc->zc_name, zc->zc_value, ZPROP_SRC_NONE));
 }
 
 /*
  * Apply the pool properties unpacked from zc_nvlist_src{_size} to the pool
  * named by zc_name.  Returns 0 or an error from unpacking the list, opening
  * the pool or setting the properties.
  */
 static int
 zfs_ioc_pool_set_props(zfs_cmd_t *zc)
 {
 	nvlist_t *props;
 	nvpair_t *first;
 	spa_t *spa;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &props);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If the only property is the configfile, then just do a spa_lookup()
 	 * to handle the faulted case.
 	 */
 	first = nvlist_next_nvpair(props, NULL);
 	if (first != NULL && strcmp(nvpair_name(first),
 	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
 	    nvlist_next_nvpair(props, first) == NULL) {
 		mutex_enter(&spa_namespace_lock);
 		spa = spa_lookup(zc->zc_name);
 		if (spa != NULL) {
 			spa_configfile_set(spa, props, B_FALSE);
 			spa_config_sync(spa, B_FALSE, B_TRUE);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* pool not found by name: fall through to the normal path */
 		if (spa != NULL) {
 			nvlist_free(props);
 			return (0);
 		}
 	}
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = spa_prop_set(spa, props);
 		spa_close(spa, FTAG);
 	}
 
 	nvlist_free(props);
 	return (error);
 }
 
 /*
  * Fetch the properties of the pool named by zc_name and copy them out
  * through put_nvlist().
  *
  * Note that any failure to obtain the properties, as well as a zero
  * zc_nvlist_dst, is reported to the caller as EFAULT; only errors from
  * put_nvlist() itself are passed through unchanged.
  */
 static int
 zfs_ioc_pool_get_props(zfs_cmd_t *zc)
 {
 	nvlist_t *nvp = NULL;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = spa_prop_get(spa, &nvp);
 		spa_close(spa, FTAG);
 	} else {
 		/*
 		 * If the pool is faulted, there may be properties we can still
 		 * get (such as altroot and cachefile), so attempt to get them
 		 * anyway.
 		 */
 		mutex_enter(&spa_namespace_lock);
 		spa = spa_lookup(zc->zc_name);
 		if (spa != NULL)
 			error = spa_prop_get(spa, &nvp);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (error != 0 || zc->zc_nvlist_dst == 0)
 		error = SET_ERROR(EFAULT);
 	else
 		error = put_nvlist(zc, nvp);
 
 	nvlist_free(nvp);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_src{_size}	nvlist of delegated permissions
  * zc_perm_action	allow/unallow flag
  *
  * outputs:		none
  */
 static int
 zfs_ioc_set_fsacl(zfs_cmd_t *zc)
 {
 	nvlist_t *fsaclnv = NULL;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &fsaclnv);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Verify nvlist is constructed correctly.  Whatever the verifier
 	 * reports, the caller sees EINVAL.
 	 */
 	if (zfs_deleg_verify_nvlist(fsaclnv) != 0) {
 		nvlist_free(fsaclnv);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If we don't have PRIV_SYS_MOUNT, then validate
 	 * that user is allowed to hand out each permission in
 	 * the nvlist(s)
 	 */
 	error = secpolicy_zfs(CRED());
 	if (error != 0) {
 		error = (zc->zc_perm_action == B_FALSE) ?
 		    dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()) :
 		    dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED());
 	}
 
 	if (error == 0)
 		error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
 
 	nvlist_free(fsaclnv);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  *
  * outputs:
  * zc_nvlist_dst{_size}	nvlist of delegated permissions
  */
 static int
 zfs_ioc_get_fsacl(zfs_cmd_t *zc)
 {
 	nvlist_t *nvp;
 	int error;
 
 	error = dsl_deleg_get(zc->zc_name, &nvp);
 	if (error != 0)
 		return (error);
 
 	/* hand the list to the caller, then drop our copy */
 	error = put_nvlist(zc, nvp);
 	nvlist_free(nvp);
 	return (error);
 }
 
 /*
  * Search the vfs list for a specified resource.  Returns a pointer to it
  * or NULL if no suitable entry is found. The caller of this routine
  * is responsible for releasing the returned vfs pointer.
  */
 static vfs_t *
 zfs_get_vfs(const char *resource)
 {
 	vfs_t *mp;
 
 	/* walk the mount list under its lock, holding the first match */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (strcmp(refstr_value(mp->vfs_resource), resource) != 0)
 			continue;
 		VFS_HOLD(mp);
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (mp);
 }
 
 /*
  * Creation callback used for DMU_OST_ZFS objsets: 'arg' is the zfs_creat_t
  * whose zct_zplprops are handed to zfs_create_fs().
  */
 /* ARGSUSED */
 static void
 zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	zfs_create_fs(os, cr, ((zfs_creat_t *)arg)->zct_zplprops, tx);
 }
 
 /* Placeholder for a zpl property value the creator did not supply. */
 #define	ZFS_PROP_UNDEFINED	((uint64_t)-1)
 
 /*
  * inputs:
  * os			parent objset pointer (NULL if root fs)
  * fuids_ok		fuids allowed in this version of the spa?
  * sa_ok		SAs allowed in this version of the spa?
  * createprops		list of properties requested by creator
  *
  * outputs:
  * zplprops	values for the zplprops we attach to the master node object
  * is_ci	true if requested file system will be purely case-insensitive
  *
  * Determine the settings for utf8only, normalization and
  * casesensitivity.  Specific values may have been requested by the
  * creator and/or we can inherit values from the parent dataset.  If
  * the file system is of too early a vintage, a creator can not
  * request settings for these properties, even if the requested
  * setting is the default value.  We don't actually want to create dsl
  * properties for these, so remove them from the source nvlist after
  * processing.
  */
 static int
 zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
     boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	uint64_t sense = ZFS_PROP_UNDEFINED;
 	uint64_t norm = ZFS_PROP_UNDEFINED;
 	uint64_t u8 = ZFS_PROP_UNDEFINED;
 	boolean_t creator_chose;
 
 	ASSERT(zplprops != NULL);
 
 	/*
 	 * Pull out creator prop choices, if any.  The version stays in the
 	 * list; the other three are removed once they have been read.
 	 */
 	if (createprops != NULL) {
 		const char *normname = zfs_prop_to_name(ZFS_PROP_NORMALIZE);
 		const char *u8name = zfs_prop_to_name(ZFS_PROP_UTF8ONLY);
 		const char *casename = zfs_prop_to_name(ZFS_PROP_CASE);
 
 		(void) nvlist_lookup_uint64(createprops,
 		    zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
 
 		(void) nvlist_lookup_uint64(createprops, normname, &norm);
 		(void) nvlist_remove_all(createprops, normname);
 
 		(void) nvlist_lookup_uint64(createprops, u8name, &u8);
 		(void) nvlist_remove_all(createprops, u8name);
 
 		(void) nvlist_lookup_uint64(createprops, casename, &sense);
 		(void) nvlist_remove_all(createprops, casename);
 	}
 
 	/*
 	 * Error out if the requested zpl version is out of range, if it
 	 * needs FUIDs or SAs that are not allowed, or if it is too old to
 	 * support normalization and the creator tried to set a value for
 	 * one of those props.
 	 */
 	if (zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION)
 		return (SET_ERROR(ENOTSUP));
 	if (zplver >= ZPL_VERSION_FUID && !fuids_ok)
 		return (SET_ERROR(ENOTSUP));
 	if (zplver >= ZPL_VERSION_SA && !sa_ok)
 		return (SET_ERROR(ENOTSUP));
 
 	creator_chose = (norm != ZFS_PROP_UNDEFINED ||
 	    u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED);
 	if (zplver < ZPL_VERSION_NORMALIZATION && creator_chose)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Put the version in the zplprops
 	 */
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
 
 	/* anything the creator left unset is fetched via zfs_get_zplprop() */
 	if (norm == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
 
 	/*
 	 * If we're normalizing, names must always be valid UTF-8 strings.
 	 */
 	if (norm != 0)
 		u8 = 1;
 	if (u8 == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
 
 	if (sense == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
 
 	if (is_ci != NULL)
 		*is_ci = (sense == ZFS_CASE_INSENSITIVE);
 
 	return (0);
 }
 
 /*
  * Fill in zplprops for the new file system 'dataset', inheriting unset
  * values from its parent dataset.
  *
  * dataset	full name of the file system being created; it must contain
  *		a '/' so that a parent name can be derived
  * createprops	properties requested by the creator (may be NULL)
  * zplprops	output list, filled by zfs_fill_zplprops_impl()
  * is_ci	optional output, see zfs_fill_zplprops_impl()
  *
  * Returns 0 on success, EINVAL if 'dataset' has no parent component, or the
  * error from opening the pool, holding the parent or filling the props.
  */
 static int
 zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	boolean_t fuids_ok, sa_ok;
 	uint64_t zplver;
 	objset_t *os = NULL;
 	char parentname[MAXNAMELEN];
 	char *cp;
 	spa_t *spa;
 	uint64_t spa_vers;
 	int error;
 
 	/*
 	 * Derive the parent name by cutting at the last '/'.  A name without
 	 * one has no parent; fail cleanly instead of relying on an ASSERT
 	 * that is compiled out of non-debug builds.
 	 */
 	(void) strlcpy(parentname, dataset, sizeof (parentname));
 	cp = strrchr(parentname, '/');
 	if (cp == NULL)
 		return (SET_ERROR(EINVAL));
 	cp[0] = '\0';
 
 	if ((error = spa_open(dataset, &spa, FTAG)) != 0)
 		return (error);
 
 	spa_vers = spa_version(spa);
 	spa_close(spa, FTAG);
 
 	/* the pool version decides the zpl version and what it may use */
 	zplver = zfs_zpl_version_map(spa_vers);
 	fuids_ok = (zplver >= ZPL_VERSION_FUID);
 	sa_ok = (zplver >= ZPL_VERSION_SA);
 
 	/*
 	 * Open parent object set so we can inherit zplprop values.
 	 */
 	if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
 		return (error);
 
 	error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
 	    zplprops, is_ci);
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Variant of zfs_fill_zplprops() that takes the pool version directly and
  * passes a NULL parent objset to zfs_fill_zplprops_impl().
  */
 static int
 zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	uint64_t zplver = zfs_zpl_version_map(spa_vers);
 	boolean_t fuids_ok = (zplver >= ZPL_VERSION_FUID);
 	boolean_t sa_ok = (zplver >= ZPL_VERSION_SA);
 
 	return (zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
 	    createprops, zplprops, is_ci));
 }
 
 /*
  * innvl: {
  *     "type" -> dmu_objset_type_t (int32)
  *     (optional) "props" -> { prop -> value }
  * }
  *
  * outnvl: propname -> error code (int32)
  */
 static int
 zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	int error = 0;
 	zfs_creat_t zct = { 0 };
 	nvlist_t *nvprops = NULL;
 	void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 	int32_t type32;
 	dmu_objset_type_t type;
 	boolean_t is_insensitive = B_FALSE;
 
 	if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
 		return (SET_ERROR(EINVAL));
 	type = type32;
 	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
 
 	/* pick the creation callback; other objset types are rejected below */
 	if (type == DMU_OST_ZFS)
 		cbfunc = zfs_create_cb;
 	else if (type == DMU_OST_ZVOL)
 		cbfunc = zvol_create_cb;
 	else
 		cbfunc = NULL;
 
 	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	zct.zct_props = nvprops;
 
 	if (cbfunc == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (type == DMU_OST_ZVOL) {
 		uint64_t volsize, volblocksize;
 
 		/* a volume needs a size; the block size may be defaulted */
 		if (nvprops == NULL)
 			return (SET_ERROR(EINVAL));
 		if (nvlist_lookup_uint64(nvprops,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
 			return (SET_ERROR(EINVAL));
 
 		error = nvlist_lookup_uint64(nvprops,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize);
 		if (error == ENOENT) {
 			volblocksize = zfs_prop_default_numeric(
 			    ZFS_PROP_VOLBLOCKSIZE);
 		} else if (error != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = zvol_check_volblocksize(volblocksize);
 		if (error == 0)
 			error = zvol_check_volsize(volsize, volblocksize);
 		if (error != 0)
 			return (error);
 	} else if (type == DMU_OST_ZFS) {
 		/*
 		 * We have to have normalization and
 		 * case-folding flags correct when we do the
 		 * file system creation, so go figure them out
 		 * now.
 		 */
 		VERIFY(nvlist_alloc(&zct.zct_zplprops,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		error = zfs_fill_zplprops(fsname, nvprops,
 		    zct.zct_zplprops, &is_insensitive);
 		if (error != 0) {
 			nvlist_free(zct.zct_zplprops);
 			return (error);
 		}
 	}
 
 	error = dmu_objset_create(fsname, type,
 	    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
 	nvlist_free(zct.zct_zplprops);
 
 	/*
 	 * It would be nice to do this atomically.  As it is, a dataset
 	 * whose properties cannot be applied is destroyed again.
 	 */
 	if (error == 0) {
 		error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
 		    nvprops, outnvl);
 		if (error != 0)
 			(void) dsl_destroy_head(fsname);
 	}
 #ifdef __FreeBSD__
 	if (error == 0 && type == DMU_OST_ZVOL)
 		zvol_create_minors(fsname);
 #endif
 	return (error);
 }
 
 /*
  * innvl: {
  *     "origin" -> name of origin snapshot
  *     (optional) "props" -> { prop -> value }
  * }
  *
  * outnvl: propname -> error code (int32)
  */
 static int
 zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *nvprops = NULL;
 	char *origin_name;
 	int error;
 
 	if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
 		return (SET_ERROR(EINVAL));
 	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
 
 	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (dataset_namecheck(origin_name, NULL, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	error = dmu_objset_clone(fsname, origin_name);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * It would be nice to do this atomically.  As it is, a clone whose
 	 * properties cannot be applied is destroyed again.
 	 */
 	error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
 	    nvprops, outnvl);
 	if (error != 0)
 		(void) dsl_destroy_head(fsname);
 #ifdef __FreeBSD__
 	if (error == 0)
 		zvol_create_minors(fsname);
 #endif
 	return (error);
 }
 
 /*
  * innvl: {
  *     "snaps" -> { snapshot1, snapshot2 }
  *     (optional) "props" -> { prop -> value (string) }
  * }
  *
  * outnvl: snapshot -> error code (int32)
  */
 static int
 zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *snaps;
 	nvlist_t *props = NULL;
 	nvpair_t *cur = NULL;
 	int error, poollen;
 
 	(void) nvlist_lookup_nvlist(innvl, "props", &props);
 	error = zfs_check_userprops(poolname, props);
 	if (error != 0)
 		return (error);
 
 	if (!nvlist_empty(props) &&
 	    zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
 		return (SET_ERROR(ENOTSUP));
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 
 	poollen = strlen(poolname);
 	while ((cur = nvlist_next_nvpair(snaps, cur)) != NULL) {
 		const char *name = nvpair_name(cur);
 		const char *at = strchr(name, '@');
 		nvpair_t *later;
 
 		/*
 		 * The snap name must contain an @, and the part after it must
 		 * contain only valid characters.
 		 */
 		if (at == NULL ||
 		    zfs_component_namecheck(at + 1, NULL, NULL) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * The snap must be in the specified pool.
 		 */
 		if (strncmp(name, poolname, poollen) != 0 ||
 		    (name[poollen] != '/' && name[poollen] != '@'))
 			return (SET_ERROR(EXDEV));
 
 		/*
 		 * This must be the only snap of this fs: no later entry may
 		 * share the "<fs>@" prefix.
 		 */
 		later = cur;
 		while ((later = nvlist_next_nvpair(snaps, later)) != NULL) {
 			if (strncmp(name, nvpair_name(later),
 			    at - name + 1) == 0)
 				return (SET_ERROR(EXDEV));
 		}
 	}
 
 	return (dsl_dataset_snapshot(snaps, props, outnvl));
 }
 
 /*
  * innvl: "message" -> string
  */
 /* ARGSUSED */
 static int
 zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	char *poolname;
 	char *message;
 	spa_t *spa;
 	int error;
 
 	/*
 	 * The poolname in the ioctl is not set, we get it from the TSD,
 	 * which was set at the end of the last successful ioctl that allows
 	 * logging.  The secpolicy func already checked that it is set.
 	 * Only one log ioctl is allowed after each successful ioctl, so
 	 * we clear the TSD here.
 	 */
 	poolname = tsd_get(zfs_allow_log_key);
 	(void) tsd_set(zfs_allow_log_key, NULL);
 	error = spa_open(poolname, &spa, FTAG);
 	strfree(poolname);
 	if (error != 0)
 		return (error);
 
 	/* every outcome from here on closes the pool exactly once */
 	if (nvlist_lookup_string(innvl, "message", &message) != 0)
 		error = SET_ERROR(EINVAL);
 	else if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY)
 		error = SET_ERROR(ENOTSUP);
 	else
 		error = spa_history_log(spa, message);
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * The dp_config_rwlock must not be held when calling this, because the
  * unmount may need to write out data.
  *
  * This function is best-effort.  Callers must deal gracefully if it
  * remains mounted (or is remounted after this call).
  *
  * Returns 0 if the argument is not a snapshot, or it is not currently a
  * filesystem, or we were able to unmount it.  Returns error code otherwise.
  */
 int
 zfs_unmount_snap(const char *snapname)
 {
 	vfs_t *vfsp;
 	zfsvfs_t *zfsvfs;
 	int err;
 
 	/* names without an '@' are not snapshots: nothing to do */
 	if (strchr(snapname, '@') == NULL)
 		return (0);
 
 	/* zfs_get_vfs() returns a held vfs, or NULL if it is not mounted */
 	vfsp = zfs_get_vfs(snapname);
 	if (vfsp == NULL)
 		return (0);
 
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
 
 	err = vn_vfswlock(vfsp->vfs_vnodecovered);
 	/*
 	 * NOTE(review): the hold taken by zfs_get_vfs() is dropped here, yet
 	 * vfsp is still passed to dounmount() below -- confirm what keeps
 	 * the mount alive in between.
 	 */
 	VFS_RELE(vfsp);
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	/*
 	 * Always force the unmount for snapshots.
 	 */
 
 	/* the result of dounmount() is deliberately ignored (best effort) */
 #ifdef illumos
 	(void) dounmount(vfsp, MS_FORCE, kcred);
 #else
 	mtx_lock(&Giant);	/* dounmount() */
 	(void) dounmount(vfsp, MS_FORCE, curthread);
 	mtx_unlock(&Giant);	/* dounmount() */
 #endif
 	return (0);
 }
 
 /*
  * Callback-shaped wrapper around zfs_unmount_snap(); 'arg' is ignored.
  */
 /* ARGSUSED */
 static int
 zfs_unmount_snap_cb(const char *snapname, void *arg)
 {
 	int err = zfs_unmount_snap(snapname);
 
 	return (err);
 }
 
 /*
  * When a clone is destroyed, its origin may also need to be destroyed,
  * in which case it must be unmounted.  This routine will do that unmount
  * if necessary.
  */
 void
 zfs_destroy_unmount_origin(const char *fsname)
 {
 	int error;
 	objset_t *os;
 	dsl_dataset_t *ds;
 
 	error = dmu_objset_hold(fsname, FTAG, &os);
 	if (error != 0)
 		return;
 	ds = dmu_objset_ds(os);
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
 		char originname[MAXNAMELEN];
 		dsl_dataset_name(ds->ds_prev, originname);
 		dmu_objset_rele(os, FTAG);
 		(void) zfs_unmount_snap(originname);
 	} else {
 		dmu_objset_rele(os, FTAG);
 	}
 }
 
 /*
  * innvl: {
  *     "snaps" -> { snapshot1, snapshot2 }
  *     (optional boolean) "defer"
  * }
  *
  * outnvl: snapshot -> error code (int32)
  *
  */
 /* ARGSUSED */
 static int
 zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *snaps;
 	nvpair_t *cur = NULL;
 	boolean_t defer;
 	int poollen;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 	defer = nvlist_exists(innvl, "defer");
 	poollen = strlen(poolname);
 
 	while ((cur = nvlist_next_nvpair(snaps, cur)) != NULL) {
 		const char *name = nvpair_name(cur);
 		int error;
 
 		/*
 		 * The snap must be in the specified pool to prevent the
 		 * invalid removal of zvol minors below.
 		 */
 		if (strncmp(name, poolname, poollen) != 0 ||
 		    (name[poollen] != '/' && name[poollen] != '@'))
 			return (SET_ERROR(EXDEV));
 
 		/* a snapshot that cannot be unmounted aborts the request */
 		error = zfs_unmount_snap(name);
 		if (error != 0)
 			return (error);
 #if defined(__FreeBSD__)
 		zvol_remove_minors(name);
 #endif
 	}
 
 	return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
 }
 
 /*
  * Create bookmarks.  Bookmark names are of the form <fs>#<bmark>.
  * All bookmarks must be in the same pool.
  *
  * innvl: {
  *     bookmark1 -> snapshot1, bookmark2 -> snapshot2
  * }
  *
  * outnvl: bookmark -> error code (int32)
  *
  */
 /* ARGSUSED */
 static int
 zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvpair_t *cur = NULL;
 
 	while ((cur = nvlist_next_nvpair(innvl, cur)) != NULL) {
 		char *snap_name;
 		nvpair_t *later;
 
 		/*
 		 * Verify the snapshot argument: it must be a string.
 		 */
 		if (nvpair_value_string(cur, &snap_name) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* Verify that the keys (bookmarks) are unique */
 		later = cur;
 		while ((later = nvlist_next_nvpair(innvl, later)) != NULL) {
 			if (strcmp(nvpair_name(cur), nvpair_name(later)) == 0)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	return (dsl_bookmark_create(innvl, outnvl));
 }
 
 /*
  * innvl: {
  *     property 1, property 2, ...
  * }
  *
  * outnvl: {
  *     bookmark name 1 -> { property 1, property 2, ... },
  *     bookmark name 2 -> { property 1, property 2, ... }
  * }
  *
  */
 static int
 zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	/* no validation is done here; it is all left to the DSL layer */
 	int error = dsl_get_bookmarks(fsname, innvl, outnvl);
 
 	return (error);
 }
 
 /*
  * innvl: {
  *     bookmark name 1, bookmark name 2
  * }
  *
  * outnvl: bookmark -> error code (int32)
  *
  */
 static int
 zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
     nvlist_t *outnvl)
 {
 	nvpair_t *cur = NULL;
 	int poollen = strlen(poolname);
 
 	while ((cur = nvlist_next_nvpair(innvl, cur)) != NULL) {
 		const char *name = nvpair_name(cur);
 		const char *hash = strchr(name, '#');
 
 		/*
 		 * The bookmark name must contain an #, and the part after it
 		 * must contain only valid characters.
 		 */
 		if (hash == NULL ||
 		    zfs_component_namecheck(hash + 1, NULL, NULL) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * The bookmark must be in the specified pool.
 		 */
 		if (strncmp(name, poolname, poollen) != 0 ||
 		    (name[poollen] != '/' && name[poollen] != '#'))
 			return (SET_ERROR(EXDEV));
 	}
 
 	return (dsl_bookmark_destroy(innvl, outnvl));
 }
 
 /*
  * inputs:
  * zc_name		name of dataset to destroy
  * zc_objset_type	type of objset
  * zc_defer_destroy	mark for deferred destroy
  *
  * outputs:		none
  */
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
 	int err;
 
 	/* zfs_unmount_snap() itself does nothing for non-snapshot names */
 	if (zc->zc_objset_type == DMU_OST_ZFS) {
 		err = zfs_unmount_snap(zc->zc_name);
 		if (err != 0)
 			return (err);
 	}
 
 	err = (strchr(zc->zc_name, '@') != NULL) ?
 	    dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy) :
 	    dsl_destroy_head(zc->zc_name);
 
 	/* device nodes are removed only once the destroy has succeeded */
 	if (err == 0 && zc->zc_objset_type == DMU_OST_ZVOL) {
 #ifdef __FreeBSD__
 		zvol_remove_minors(zc->zc_name);
 #else
 		(void) zvol_remove_minor(zc->zc_name);
 #endif
 	}
 	return (err);
 }
 
 /*
  * fsname is name of dataset to rollback (to most recent snapshot)
  *
  * innvl is not used.
  *
  * outnvl: {
  *     "target" -> name of most recent snapshot
  * }
  */
 /* ARGSUSED */
 static int
 zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
 {
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	/* no zfsvfs for this name: roll back without suspending anything */
 	if (getzfsvfs(fsname, &zfsvfs) != 0)
 		return (dsl_dataset_rollback(fsname, NULL, outnvl));
 
 	error = zfs_suspend_fs(zfsvfs);
 	if (error == 0) {
 		int resume_err;
 
 		error = dsl_dataset_rollback(fsname, zfsvfs, outnvl);
 		/* always resume; a rollback error takes precedence */
 		resume_err = zfs_resume_fs(zfsvfs, fsname);
 		if (error == 0)
 			error = resume_err;
 	}
 	VFS_RELE(zfsvfs->z_vfs);
 	return (error);
 }
 
 /*
  * Callback for dmu_objset_find(): unmount the snapshot "<fsname>@<arg>",
  * where 'arg' is the bare snapshot name.
  *
  * Returns the result of zfs_unmount_snap(), or 0 without unmounting
  * anything when the composed name does not fit in MAXNAMELEN.
  */
 static int
 recursive_unmount(const char *fsname, void *arg)
 {
 	const char *snapname = arg;
 	char fullname[MAXNAMELEN];
 	int len;
 
 	len = snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
 
 	/*
 	 * Never act on a truncated name: it could identify a different
 	 * snapshot, which would then be force-unmounted.  Skip it and let
 	 * the walk continue.
 	 */
 	if (len < 0 || (size_t)len >= sizeof (fullname))
 		return (0);
 
 	return (zfs_unmount_snap(fullname));
 }
 
 /*
  * inputs:
  * zc_name	old name of dataset
  * zc_value	new name of dataset
  * zc_cookie	recursive flag (only valid for snapshots)
  *
  * outputs:	none
  */
 static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
 	/* bit 0 of zc_cookie: recursive rename */
 	boolean_t recursive = zc->zc_cookie & 1;
 #ifdef __FreeBSD__
 	/* bit 1 of zc_cookie: gates the snapshot unmount step below */
 	boolean_t allow_mounted = zc->zc_cookie & 2;
 #endif
 	char *at;
 
 	/* force termination of the new name before validating it */
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '%'))
 		return (SET_ERROR(EINVAL));
 
 	at = strchr(zc->zc_name, '@');
 	if (at != NULL) {
 		/* snaps must be in same fs */
 		int error;
 
 		/* compares everything up to and including the '@' */
 		if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
 			return (SET_ERROR(EXDEV));
 		/*
 		 * Temporarily split zc_name into "<fs>" and "<snap>"; the
 		 * '@' is restored before every return below.
 		 */
 		*at = '\0';
 #ifdef illumos
 		if (zc->zc_objset_type == DMU_OST_ZFS) {
 #else
 		if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) {
 #endif
 			/* unmount the snapshot (and children's, if recursive) */
 			error = dmu_objset_find(zc->zc_name,
 			    recursive_unmount, at + 1,
 			    recursive ? DS_FIND_CHILDREN : 0);
 			if (error != 0) {
 				*at = '@';
 				return (error);
 			}
 		}
 		/*
 		 * The prefix check above guarantees that zc_value contains
 		 * an '@', so the strchr() result is never NULL here.
 		 */
 		error = dsl_dataset_rename_snapshot(zc->zc_name,
 		    at + 1, strchr(zc->zc_value, '@') + 1, recursive);
 		*at = '@';
 
 		return (error);
 	} else {
 #ifdef illumos
 		if (zc->zc_objset_type == DMU_OST_ZVOL)
 			(void) zvol_remove_minor(zc->zc_name);
 #endif
 		return (dsl_dir_rename(zc->zc_name, zc->zc_value));
 	}
 }
 
 /*
  * Decide whether the property carried by 'pair' may be set on 'dsname'.
  *
  * dsname	name of the dataset (a name containing '@' is a snapshot)
  * pair		property name/value; the value may be wrapped in an nvlist
  *		that holds the real value under ZPROP_VALUE
  * cr		credentials used for every permission check made here
  *
  * Returns 0 if the property may be set.  Otherwise returns EINVAL for an
  * unrecognized or read-only property name and for any native property on a
  * snapshot, ENOTSUP when the pool/ZPL version or a pool feature required by
  * the value is missing, ERANGE or EDOM when the value is not acceptable for
  * this dataset, or the error of a failing spa_open() / permission check.
  */
 static int
 zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
 {
 	const char *propname = nvpair_name(pair);
 	boolean_t issnap = (strchr(dsname, '@') != NULL);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
 	int err;
 
 	if (prop == ZPROP_INVAL) {
 		/* Not a native property: user property or userquota. */
 		if (zfs_prop_user(propname)) {
 			return (zfs_secpolicy_write_perms(dsname,
 			    ZFS_DELEG_PERM_USERPROP, cr));
 		}
 
 		if (!issnap && zfs_prop_userquota(propname)) {
 			const char *perm = NULL;
 			const char *uq_prefix =
 			    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
 			const char *gq_prefix =
 			    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
 
 			if (strncmp(propname, uq_prefix,
 			    strlen(uq_prefix)) == 0) {
 				perm = ZFS_DELEG_PERM_USERQUOTA;
 			} else if (strncmp(propname, gq_prefix,
 			    strlen(gq_prefix)) == 0) {
 				perm = ZFS_DELEG_PERM_GROUPQUOTA;
 			} else {
 				/* USERUSED and GROUPUSED are read-only */
 				return (SET_ERROR(EINVAL));
 			}
 
 			return (zfs_secpolicy_write_perms(dsname, perm, cr));
 		}
 
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Native properties are never settable on a snapshot here. */
 	if (issnap)
 		return (SET_ERROR(EINVAL));
 
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		/*
 		 * dsl_prop_get_all_impl() returns properties in this
 		 * format.
 		 */
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 		    &pair) == 0);
 	}
 
 	/*
 	 * Check that this value is valid for this pool version
 	 */
 	switch (prop) {
 	case ZFS_PROP_COMPRESSION:
 		/*
 		 * If the user specified gzip compression, make sure
 		 * the SPA supports it. We ignore any errors here since
 		 * we'll catch them later.
 		 */
 		if (nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9 &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_GZIP_COMPRESSION)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 
 			if (intval == ZIO_COMPRESS_ZLE &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_ZLE_COMPRESSION))
 				return (SET_ERROR(ENOTSUP));
 
 			if (intval == ZIO_COMPRESS_LZ4) {
 				spa_t *spa;
 
 				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 					return (err);
 
 				if (!spa_feature_is_enabled(spa,
 				    SPA_FEATURE_LZ4_COMPRESS)) {
 					spa_close(spa, FTAG);
 					return (SET_ERROR(ENOTSUP));
 				}
 				spa_close(spa, FTAG);
 			}
 
 			/*
 			 * If this is a bootable dataset then
 			 * verify that the compression algorithm
 			 * is supported for booting. We must return
 			 * something other than ENOTSUP since it
 			 * implies a downrev pool version.
 			 */
 			if (zfs_is_bootfs(dsname) &&
 			    !BOOTFS_COMPRESS_VALID(intval)) {
 				return (SET_ERROR(ERANGE));
 			}
 		}
 		break;
 
 	case ZFS_PROP_COPIES:
 		if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
 			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_DEDUP:
 		if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
 			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_RECORDSIZE:
 		/* Record sizes above 128k need the feature to be enabled */
 		if (nvpair_value_uint64(pair, &intval) == 0 &&
 		    intval > SPA_OLD_MAXBLOCKSIZE) {
 			spa_t *spa;
 
 			/*
 			 * If this is a bootable dataset then
 			 * we don't allow large (>128K) blocks,
 			 * because GRUB doesn't support them.
 			 */
 			if (zfs_is_bootfs(dsname) &&
 			    intval > SPA_OLD_MAXBLOCKSIZE) {
 				return (SET_ERROR(EDOM));
 			}
 
 			/*
 			 * We don't allow setting the property above 1MB,
 			 * unless the tunable has been changed.
 			 */
 			if (intval > zfs_max_recordsize ||
 			    intval > SPA_MAXBLOCKSIZE)
 				return (SET_ERROR(EDOM));
 
 			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 				return (err);
 
 			if (!spa_feature_is_enabled(spa,
 			    SPA_FEATURE_LARGE_BLOCKS)) {
 				spa_close(spa, FTAG);
 				return (SET_ERROR(ENOTSUP));
 			}
 			spa_close(spa, FTAG);
 		}
 		break;
 
 	case ZFS_PROP_SHARESMB:
 		if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
 			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_ACLINHERIT:
 		if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
 		    nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval == ZFS_ACL_PASSTHROUGH_X &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_PASSTHROUGH_X))
 				return (SET_ERROR(ENOTSUP));
 		}
 		break;
 
 	default:
 		/* No version or feature constraint for other properties. */
 		break;
 	}
 
 	/*
 	 * Use the caller-supplied credentials for the final check as well,
 	 * so the whole function judges a single identity.
 	 */
 	return (zfs_secpolicy_setprop(dsname, prop, pair, cr));
 }
 
 /*
  * Check callback of the feature-activation sync task.  Reports EBUSY when
  * the feature is already active, so that a racing activation does not
  * increment the feature flag a second time.
  */
 static int
 zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
 {
 	spa_feature_t *feat = arg;
 	spa_t *pool = dmu_tx_pool(tx)->dp_spa;
 
 	if (spa_feature_is_active(pool, *feat))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Sync callback of the task created by zfs_prop_activate_feature():
  * increments the feature passed through 'arg' within transaction 'tx'.
  */
 static void
 zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_feature_t *feat = arg;
 
 	spa_feature_incr(dmu_tx_pool(tx)->dp_spa, *feat, tx);
 }
 
 /*
  * Activate 'feature' on pool 'spa' because a property setting needs it.
  * The work is done by a sync task built from the check/sync callbacks
  * above.  Returns 0 on success or when the feature was already active,
  * otherwise the error reported by dsl_sync_task().
  */
 static int
 zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
 {
 	int rc = dsl_sync_task(spa_name(spa),
 	    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
 	    &feature, 2, ZFS_SPACE_CHECK_RESERVED);
 
 	/* EBUSY from the check callback means "already active". */
 	if (rc == 0 || rc == EBUSY)
 		return (0);
 
 	return (rc);
 }
 
 /*
  * Prune from 'props' every property that the caller is not permitted both
  * to set and to inherit; those permissions are needed to clear a property
  * and to restore it again should a receive fail.
  *
  * The return value is the error recorded for the first rejected property,
  * or zero if nothing was rejected.  When 'errlist' is non-NULL it receives
  * an nvlist mapping each rejected property name to its error number (NULL
  * if there were none); the caller must free that list.
  */
 static int
 zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
 {
 	zfs_cmd_t *cmd;
 	nvpair_t *cur, *following;
 	nvlist_t *failed;
 	int rc, first_err = 0;
 
 	if (props == NULL)
 		return (0);
 
 	VERIFY(nvlist_alloc(&failed, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* Scratch command used only to drive zfs_secpolicy_inherit_prop(). */
 	cmd = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
 	(void) strcpy(cmd->zc_name, dataset);
 
 	for (cur = nvlist_next_nvpair(props, NULL); cur != NULL;
 	    cur = following) {
 		/* Fetch the successor first: 'cur' may be removed below. */
 		following = nvlist_next_nvpair(props, cur);
 
 		(void) strcpy(cmd->zc_value, nvpair_name(cur));
 		rc = zfs_check_settable(dataset, cur, CRED());
 		if (rc == 0)
 			rc = zfs_secpolicy_inherit_prop(cmd, NULL, CRED());
 		if (rc != 0) {
 			VERIFY(nvlist_remove_nvpair(props, cur) == 0);
 			VERIFY(nvlist_add_int32(failed,
 			    cmd->zc_value, rc) == 0);
 		}
 	}
 	kmem_free(cmd, sizeof (zfs_cmd_t));
 
 	cur = nvlist_next_nvpair(failed, NULL);
 	if (cur != NULL) {
 		VERIFY(nvpair_value_int32(cur, &first_err) == 0);
 	} else {
 		nvlist_free(failed);
 		failed = NULL;
 	}
 
 	if (errlist != NULL)
 		*errlist = failed;
 	else
 		nvlist_free(failed);
 
 	return (first_err);
 }
 
 /*
  * Compare the values of two property nvpairs.  Either pair may hold its
  * value inside an nvlist under ZPROP_VALUE (the dsl_prop_get_all_impl()
  * format); such pairs are unwrapped first.  Pairs of different types are
  * unequal, strings are compared with strcmp(), and every other type is
  * read as a uint64.
  */
 static boolean_t
 propval_equals(nvpair_t *p1, nvpair_t *p2)
 {
 	nvlist_t *wrapper;
 
 	if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
 		VERIFY(nvpair_value_nvlist(p1, &wrapper) == 0);
 		VERIFY(nvlist_lookup_nvpair(wrapper, ZPROP_VALUE, &p1) == 0);
 	}
 
 	if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
 		VERIFY(nvpair_value_nvlist(p2, &wrapper) == 0);
 		VERIFY(nvlist_lookup_nvpair(wrapper, ZPROP_VALUE, &p2) == 0);
 	}
 
 	if (nvpair_type(p1) != nvpair_type(p2))
 		return (B_FALSE);
 
 	if (nvpair_type(p1) != DATA_TYPE_STRING) {
 		uint64_t lhs, rhs;
 
 		VERIFY(nvpair_value_uint64(p1, &lhs) == 0);
 		VERIFY(nvpair_value_uint64(p2, &rhs) == 0);
 		return (lhs == rhs);
 	} else {
 		char *lstr, *rstr;
 
 		VERIFY(nvpair_value_string(p1, (char **)&lstr) == 0);
 		VERIFY(nvpair_value_string(p2, (char **)&rstr) == 0);
 		return (strcmp(lstr, rstr) == 0);
 	}
 }
 
 /*
  * Drop from 'props' every property whose value equals the one already in
  * 'origprops', and drop it from 'origprops' too: a property that will not
  * change needs neither to be received nor to be cleared or restored.
  */
 static void
 props_reduce(nvlist_t *props, nvlist_t *origprops)
 {
 	nvpair_t *cur, *following;
 
 	/* Nothing to compare against: all props need to be received. */
 	if (origprops == NULL)
 		return;
 
 	for (cur = nvlist_next_nvpair(props, NULL); cur != NULL;
 	    cur = following) {
 		nvpair_t *match;
 
 		/* Fetch the successor first: 'cur' may be removed below. */
 		following = nvlist_next_nvpair(props, cur);
 
 		if (nvlist_lookup_nvpair(origprops, nvpair_name(cur),
 		    &match) == 0 && propval_equals(cur, match)) {
 			/* don't clear the existing received value */
 			(void) nvlist_remove_nvpair(origprops, match);
 			/* don't bother receiving the property */
 			(void) nvlist_remove_nvpair(props, cur);
 		}
 		/* otherwise the received value still has to be set */
 	}
 }
 
 #ifdef	DEBUG
 /*
  * Debug-only switch: when set, the next zfs_ioc_recv() call forces its
  * result to an error (and resets this flag).
  */
 static boolean_t zfs_ioc_recv_inject_err;
 #endif
 
 /*
  * Receive a send stream from a file descriptor into the snapshot named by
  * zc_value, optionally applying a set of received properties first.
  *
  * inputs:
  * zc_name		name of containing filesystem
  * zc_nvlist_src{_size}	nvlist of properties to apply
  * zc_value		name of snapshot to create
  * zc_string		name of clone origin (if DRR_FLAG_CLONE)
  * zc_cookie		file descriptor to recv from
  * zc_begin_record	the BEGIN record of the stream (not byteswapped)
  * zc_guid		force flag
  * zc_cleanup_fd	cleanup-on-exit file descriptor
  * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
  *
  * outputs:
  * zc_cookie		number of bytes read
  * zc_nvlist_dst{_size} error for each unapplied received property
  * zc_obj		zprop_errflags_t
  * zc_action_handle	handle for this guid/ds mapping
  */
 static int
 zfs_ioc_recv(zfs_cmd_t *zc)
 {
 	file_t *fp;
 	dmu_recv_cookie_t drc;
 	boolean_t force = (boolean_t)zc->zc_guid;
 	int fd;
 	int error = 0;
 	int props_error = 0;
 	nvlist_t *errors;
 	offset_t off;
 	nvlist_t *props = NULL; /* sent properties */
 	nvlist_t *origprops = NULL; /* existing properties */
 	char *origin = NULL;
 	char *tosnap;
 	char tofs[ZFS_MAXNAMELEN];
 	cap_rights_t rights;
 	boolean_t first_recvd_props = B_FALSE;
 
 	/* The target must be a valid snapshot name without a '%'. */
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL ||
 	    strchr(zc->zc_value, '%'))
 		return (SET_ERROR(EINVAL));
 
 	/* Split "fs@snap" into tofs ("fs") and tosnap ("snap"). */
 	(void) strcpy(tofs, zc->zc_value);
 	tosnap = strchr(tofs, '@');
 	*tosnap++ = '\0';
 
 	if (zc->zc_nvlist_src != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &props)) != 0)
 		return (error);
 
 	fd = zc->zc_cookie;
 	fp = getf(fd, cap_rights_init(&rights, CAP_PREAD));
 	if (fp == NULL) {
 		nvlist_free(props);
 		return (SET_ERROR(EBADF));
 	}
 
 	/* Collects per-property errors for zc_nvlist_dst. */
 	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	if (zc->zc_string[0])
 		origin = zc->zc_string;
 
 	error = dmu_recv_begin(tofs, tosnap,
 	    &zc->zc_begin_record, force, origin, &drc);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Set properties before we receive the stream so that they are applied
 	 * to the new data. Note that we must call dmu_recv_stream() if
 	 * dmu_recv_begin() succeeds.
 	 */
 	if (props != NULL && !drc.drc_newfs) {
 		if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
 		    SPA_VERSION_RECVD_PROPS &&
 		    !dsl_prop_get_hasrecvd(tofs))
 			first_recvd_props = B_TRUE;
 
 		/*
 		 * If new received properties are supplied, they are to
 		 * completely replace the existing received properties, so stash
 		 * away the existing ones.
 		 */
 		if (dsl_prop_get_received(tofs, &origprops) == 0) {
 			nvlist_t *errlist = NULL;
 			/*
 			 * Don't bother writing a property if its value won't
 			 * change (and avoid the unnecessary security checks).
 			 *
 			 * The first receive after SPA_VERSION_RECVD_PROPS is a
 			 * special case where we blow away all local properties
 			 * regardless.
 			 */
 			if (!first_recvd_props)
 				props_reduce(props, origprops);
 			if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
 				(void) nvlist_merge(errors, errlist, 0);
 			nvlist_free(errlist);
 
 			if (clear_received_props(tofs, origprops,
 			    first_recvd_props ? NULL : props) != 0)
 				zc->zc_obj |= ZPROP_ERR_NOCLEAR;
 		} else {
 			zc->zc_obj |= ZPROP_ERR_NOCLEAR;
 		}
 	}
 
 	if (props != NULL) {
 		props_error = dsl_prop_set_hasrecvd(tofs);
 
 		if (props_error == 0) {
 			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
 			    props, errors);
 		}
 	}
 
 	if (zc->zc_nvlist_dst_size != 0 &&
 	    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
 	    put_nvlist(zc, errors) != 0)) {
 		/*
 		 * Caller made zc->zc_nvlist_dst less than the minimum expected
 		 * size or supplied an invalid address.
 		 */
 		props_error = SET_ERROR(EINVAL);
 	}
 
 	/* Receive the stream starting at the descriptor's current offset. */
 	off = fp->f_offset;
 	error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
 	    &zc->zc_action_handle);
 
 	if (error == 0) {
 		zfsvfs_t *zfsvfs = NULL;
 
 		if (getzfsvfs(tofs, &zfsvfs) == 0) {
 			/* online recv */
 			int end_err;
 
 			error = zfs_suspend_fs(zfsvfs);
 			/*
 			 * If the suspend fails, then the recv_end will
 			 * likely also fail, and clean up after itself.
 			 */
 			end_err = dmu_recv_end(&drc, zfsvfs);
 			if (error == 0)
 				error = zfs_resume_fs(zfsvfs, tofs);
 			/* A suspend/resume error wins over the recv_end one. */
 			error = error ? error : end_err;
 			VFS_RELE(zfsvfs->z_vfs);
 		} else {
 			error = dmu_recv_end(&drc, NULL);
 		}
 	}
 
 	/* Report the bytes consumed, then advance the file offset. */
 	zc->zc_cookie = off - fp->f_offset;
 	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
 
 #ifdef	DEBUG
 	if (zfs_ioc_recv_inject_err) {
 		zfs_ioc_recv_inject_err = B_FALSE;
 		error = 1;
 	}
 #endif
 
 #ifdef __FreeBSD__
 	if (error == 0)
 		zvol_create_minors(tofs);
 #endif
 
 	/*
 	 * On error, restore the original props.
 	 */
 	if (error != 0 && props != NULL && !drc.drc_newfs) {
 		if (clear_received_props(tofs, props, NULL) != 0) {
 			/*
 			 * We failed to clear the received properties.
 			 * Since we may have left a $recvd value on the
 			 * system, we can't clear the $hasrecvd flag.
 			 */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		} else if (first_recvd_props) {
 			dsl_prop_unset_hasrecvd(tofs);
 		}
 
 		if (origprops == NULL && !drc.drc_newfs) {
 			/* We failed to stash the original properties. */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		}
 
 		/*
 		 * dsl_props_set() will not convert RECEIVED to LOCAL on or
 		 * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
 		 * explicitly if we're restoring local properties cleared in the
 		 * first new-style receive.
 		 */
 		if (origprops != NULL &&
 		    zfs_set_prop_nvlist(tofs, (first_recvd_props ?
 		    ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
 		    origprops, NULL) != 0) {
 			/*
 			 * We stashed the original properties but failed to
 			 * restore them.
 			 */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		}
 	}
 out:
 	nvlist_free(props);
 	nvlist_free(origprops);
 	nvlist_free(errors);
 	releasef(fd);
 
 	/* A stream error takes precedence over a property error. */
 	if (error == 0)
 		error = props_error;
 
 	return (error);
 }
 
 /*
  * Generate a send stream for a snapshot, or only estimate its size.
  *
  * inputs:
  * zc_name	name of snapshot to send
  * zc_cookie	file descriptor to send stream to
  * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
  * zc_sendobj	objsetid of snapshot to send
  * zc_fromobj	objsetid of incremental fromsnap (may be zero)
  * zc_guid	if set, estimate size of stream only.  zc_cookie is ignored.
  *		output size in zc_objset_type.
  * zc_flags	lzc_send_flags
  *
  * outputs:
  * zc_objset_type	estimated size, if zc_guid is set
  */
 static int
 zfs_ioc_send(zfs_cmd_t *zc)
 {
 	int error;
 	offset_t off;
 	boolean_t estimate = (zc->zc_guid != 0);
 	/* Bits 0 and 1 of zc_flags are forwarded to dmu_send_obj() below. */
 	boolean_t embedok = (zc->zc_flags & 0x1);
 	boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
 	/*
 	 * fromorigin: when the snapshot's dsl_dir is a clone, replace
 	 * zc_fromobj with the object number of the clone's origin.
 	 */
 	if (zc->zc_obj != 0) {
 		dsl_pool_t *dp;
 		dsl_dataset_t *tosnap;
 
 		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 		if (error != 0)
 			return (error);
 
 		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
 		if (error != 0) {
 			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
 
 		if (dsl_dir_is_clone(tosnap->ds_dir))
-			zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj;
+			zc->zc_fromobj =
+			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 	}
 
 	if (estimate) {
 		/* Size estimate only: nothing is written to zc_cookie. */
 		dsl_pool_t *dp;
 		dsl_dataset_t *tosnap;
 		dsl_dataset_t *fromsnap = NULL;
 
 		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 		if (error != 0)
 			return (error);
 
 		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
 		if (error != 0) {
 			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
 
 		if (zc->zc_fromobj != 0) {
 			error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
 			    FTAG, &fromsnap);
 			if (error != 0) {
 				dsl_dataset_rele(tosnap, FTAG);
 				dsl_pool_rele(dp, FTAG);
 				return (error);
 			}
 		}
 
 		error = dmu_send_estimate(tosnap, fromsnap,
 		    &zc->zc_objset_type);
 
 		if (fromsnap != NULL)
 			dsl_dataset_rele(fromsnap, FTAG);
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 	} else {
 		/* Real send: write the stream to the descriptor in zc_cookie. */
 		file_t *fp;
 		cap_rights_t rights;
 
 		fp = getf(zc->zc_cookie,
 		    cap_rights_init(&rights, CAP_WRITE));
 		if (fp == NULL)
 			return (SET_ERROR(EBADF));
 
 		off = fp->f_offset;
 		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
 		    zc->zc_fromobj, embedok, large_block_ok,
 #ifdef illumos
 		    zc->zc_cookie, fp->f_vnode, &off);
 #else
 		    zc->zc_cookie, fp, &off);
 #endif
 
 		/* Store the new offset only when it is within range. */
 		if (off >= 0 && off <= MAXOFFSET_T)
 			fp->f_offset = off;
 		releasef(zc->zc_cookie);
 	}
 	return (error);
 }
 
 /*
  * Report how far an in-flight send stream has progressed.
  *
  * inputs:
  * zc_name	name of snapshot on which to report progress
  * zc_cookie	file descriptor of send stream
  *
  * outputs:
  * zc_cookie	number of bytes written in send stream thus far
  *
  * Returns ENOENT when no matching stream is found on the dataset.
  */
 static int
 zfs_ioc_send_progress(zfs_cmd_t *zc)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	dmu_sendarg_t *dsp = NULL;
 	int error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	mutex_enter(&ds->ds_sendstream_lock);
 
 	/*
 	 * Walk the send streams active on this dataset and stop at the one
 	 * that both uses the given file descriptor and was started by the
 	 * current process.
 	 */
 	dsp = list_head(&ds->ds_sendstreams);
 	while (dsp != NULL &&
 	    !(dsp->dsa_outfd == zc->zc_cookie && dsp->dsa_proc == curproc))
 		dsp = list_next(&ds->ds_sendstreams, dsp);
 
 	if (dsp == NULL)
 		error = SET_ERROR(ENOENT);
 	else
 		zc->zc_cookie = *(dsp->dsa_off);
 
 	mutex_exit(&ds->ds_sendstream_lock);
 	dsl_dataset_rele(ds, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Register the fault described by zc_inject_record via zio_inject_fault().
  * zc_guid carries the flags on input and, on success, the id of the new
  * injection on output.
  */
 static int
 zfs_ioc_inject_fault(zfs_cmd_t *zc)
 {
 	int id;
 	int error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
 	    &zc->zc_inject_record);
 
 	if (error != 0)
 		return (error);
 
 	zc->zc_guid = (uint64_t)id;
 	return (0);
 }
 
 /*
  * Remove the injected fault whose id is passed in zc_guid.
  */
 static int
 zfs_ioc_clear_fault(zfs_cmd_t *zc)
 {
 	int injection_id = (int)zc->zc_guid;
 
 	return (zio_clear_fault(injection_id));
 }
 
 /*
  * Step to the next fault-injection record.  zc_guid is the iteration id on
  * input and is overwritten with the id left by zio_inject_list_next();
  * zc_name and zc_inject_record are filled in by that call.
  */
 static int
 zfs_ioc_inject_list_next(zfs_cmd_t *zc)
 {
 	int cursor = (int)zc->zc_guid;
 	int rc = zio_inject_list_next(&cursor, zc->zc_name,
 	    sizeof (zc->zc_name), &zc->zc_inject_record);
 
 	/* The id is written back whether or not the call succeeded. */
 	zc->zc_guid = cursor;
 
 	return (rc);
 }
 
 /*
  * Copy the pool's error log into the user buffer at zc_nvlist_dst.  On
  * success zc_nvlist_dst_size is set to the count left by spa_get_errlog();
  * on failure it is set to spa_get_errlog_size().
  */
 static int
 zfs_ioc_error_log(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	size_t nbytes = (size_t)zc->zc_nvlist_dst_size;
 	int rc = spa_open(zc->zc_name, &spa, FTAG);
 
 	if (rc != 0)
 		return (rc);
 
 	rc = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
 	    &nbytes);
 	if (rc != 0)
 		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
 	else
 		zc->zc_nvlist_dst_size = nbytes;
 
 	spa_close(spa, FTAG);
 
 	return (rc);
 }
 
 /*
  * Clear errors on a pool (zc_guid == 0) or on the single vdev whose guid is
  * zc_guid, then resume suspended I/O.  Unless ZPOOL_NO_REWIND is set in
  * zc_cookie, the pool is opened with the rewind policy from zc_nvlist_src.
  */
 static int
 zfs_ioc_clear(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	vdev_t *vd;
 	int error;
 
 	/*
 	 * On zpool clear we also fix up missing slogs
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_lookup(zc->zc_name);
 	if (spa == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EIO));
 	}
 	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
 		/* we need to let spa_open/spa_load clear the chains */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	}
 	spa->spa_last_open_failed = 0;
 	mutex_exit(&spa_namespace_lock);
 
 	if (zc->zc_cookie & ZPOOL_NO_REWIND) {
 		error = spa_open(zc->zc_name, &spa, FTAG);
 	} else {
 		nvlist_t *policy;
 		nvlist_t *config = NULL;
 
 		/* A rewind needs a policy nvlist from the caller. */
 		if (zc->zc_nvlist_src == 0)
 			return (SET_ERROR(EINVAL));
 
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
 			error = spa_open_rewind(zc->zc_name, &spa, FTAG,
 			    policy, &config);
 			if (config != NULL) {
 				int err;
 
 				/* A failed copy-out overrides the open result. */
 				if ((err = put_nvlist(zc, config)) != 0)
 					error = err;
 				nvlist_free(config);
 			}
 			nvlist_free(policy);
 		}
 	}
 
 	if (error != 0)
 		return (error);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if (zc->zc_guid == 0) {
 		/* No guid given: vdev_clear() is called with a NULL vdev. */
 		vd = NULL;
 	} else {
 		vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
 		if (vd == NULL) {
 			(void) spa_vdev_state_exit(spa, NULL, ENODEV);
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ENODEV));
 		}
 	}
 
 	vdev_clear(spa, vd);
 
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	/*
 	 * Resume any suspended I/Os.
 	 */
 	if (zio_resume(spa) != 0)
 		error = SET_ERROR(EIO);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Reopen all vdevs of the pool named by zc_name.  Returns the spa_open()
  * error, or 0 once the reopen has been performed.
  */
 static int
 zfs_ioc_pool_reopen(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int rc = spa_open(zc->zc_name, &spa, FTAG);
 
 	if (rc != 0)
 		return (rc);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	/*
 	 * While a resilver is running, spa_scrub_reopen is raised for the
 	 * duration of the reopen so that the scan is not restarted as a side
 	 * effect.  Otherwise vdev_open() decides whether a resilver is
 	 * required.
 	 */
 	spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
 /*
  * Promote a clone.
  *
  * inputs:
  * zc_name	name of filesystem
  * zc_value	name of origin snapshot
  *
  * outputs:
  * zc_string	name of conflicting snapshot, if there is one
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
 {
 	char *at = strchr(zc->zc_value, '@');
 
 	/*
 	 * Truncate zc_value at the '@' and unmount every snapshot of the
 	 * origin filesystem: more than strictly needed, but simpler.
 	 */
 	if (at != NULL)
 		*at = '\0';
 	(void) dmu_objset_find(zc->zc_value,
 	    zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
 
 	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
 /*
  * Look up one {user|group}{used|quota}@... value.
  *
  * inputs:
  * zc_name	name of filesystem
  * zc_objset_type zfs_userquota_prop_t
  * zc_value	domain name (eg. "S-1-234-567-89")
  * zc_guid	RID/UID/GID
  *
  * outputs:
  * zc_cookie	property value
  *
  * Returns EINVAL when zc_objset_type is not a valid userquota property.
  */
 static int
 zfs_ioc_userspace_one(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
 	int rc;
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
 		return (SET_ERROR(EINVAL));
 
 	rc = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
 	if (rc == 0) {
 		rc = zfs_userspace_one(zfsvfs, zc->zc_objset_type,
 		    zc->zc_value, zc->zc_guid, &zc->zc_cookie);
 		zfsvfs_rele(zfsvfs, FTAG);
 	}
 
 	return (rc);
 }
 
 /*
  * Retrieve a batch of {user|group}{used|quota} records.
  *
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_objset_type	zfs_userquota_prop_t
  * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
  *
  * outputs:
  * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
  * zc_cookie	zap cursor
  *
  * Returns ENOMEM when the buffer size is zero or does not fit in an int,
  * otherwise the result of zfsvfs_hold(), zfs_userspace_many() or the
  * copy-out.
  */
 static int
 zfs_ioc_userspace_many(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
 	int bufsize = zc->zc_nvlist_dst_size;
 
 	/*
 	 * The kernel buffer is allocated with the narrowed 'bufsize', while
 	 * zfs_userspace_many() is told the buffer is zc_nvlist_dst_size bytes
 	 * long.  Refuse any size that changes value when narrowed to int, so
 	 * the two can never disagree.
 	 */
 	if (bufsize <= 0 || (uint64_t)bufsize != zc->zc_nvlist_dst_size)
 		return (SET_ERROR(ENOMEM));
 
 	int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	void *buf = kmem_alloc(bufsize, KM_SLEEP);
 
 	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
 	    buf, &zc->zc_nvlist_dst_size);
 
 	if (error == 0) {
 		error = ddi_copyout(buf,
 		    (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    zc->zc_nvlist_dst_size, zc->zc_iflags);
 	}
 	/* Free with the allocation size, not the updated dst size. */
 	kmem_free(buf, bufsize);
 	zfsvfs_rele(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Run the userspace accounting upgrade on a filesystem.
  *
  * inputs:
  * zc_name		name of filesystem
  *
  * outputs:
  * none
  */
 static int
 zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
 	objset_t *os;
 	int error = 0;
 
 	if (getzfsvfs(zc->zc_name, &zfsvfs) != 0) {
 		/* XXX kind of reading contents without owning */
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 		if (error == 0) {
 			error = dmu_objset_userspace_upgrade(os);
 			dmu_objset_rele(os, FTAG);
 		}
 		return (error);
 	}
 
 	if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
 		/*
 		 * userused may be disabled only because the objset has to
 		 * be closed and reopened (to grow the objset_phys_t); a
 		 * suspend/resume cycle of the fs achieves that.
 		 */
 		error = zfs_suspend_fs(zfsvfs);
 		if (error == 0) {
 			dmu_objset_refresh_ownership(zfsvfs->z_os, zfsvfs);
 			error = zfs_resume_fs(zfsvfs, zc->zc_name);
 		}
 	}
 	if (error == 0)
 		error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
 	VFS_RELE(zfsvfs->z_vfs);
 
 	return (error);
 }
 
 #ifdef sun
 /*
  * We don't want to have a hard dependency
  * against some special symbols in sharefs
  * nfs, and smbsrv.  Determine them if needed when
  * the first file system is shared.
  * Neither sharefs, nfs or smbsrv are unloadable modules.
  */
 /* Entry points resolved at run time with ddi_modsym(). */
 int (*znfsexport_fs)(void *arg);
 int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
 int (*zsmbexport_fs)(void *arg, boolean_t add_share);
 
 /* Set to 1 once the corresponding module and symbols are resolved. */
 int zfs_nfsshare_inited;
 int zfs_smbshare_inited;
 
 /* Module handles returned by ddi_modopen(). */
 ddi_modhandle_t nfs_mod;
 ddi_modhandle_t sharefs_mod;
 ddi_modhandle_t smbsrv_mod;
 #endif	/* sun */
 /* Held while the handles and entry points above are being resolved. */
 kmutex_t zfs_share_lock;
 
 #ifdef sun
 static int
 zfs_init_sharefs()
 {
 	int error;
 
 	ASSERT(MUTEX_HELD(&zfs_share_lock));
 	/* Both NFS and SMB shares also require sharetab support. */
 	if (sharefs_mod == NULL && ((sharefs_mod =
 	    ddi_modopen("fs/sharefs",
 	    KRTLD_MODE_FIRST, &error)) == NULL)) {
 		return (SET_ERROR(ENOSYS));
 	}
 	if (zshare_fs == NULL && ((zshare_fs =
 	    (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
 	    ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
 		return (SET_ERROR(ENOSYS));
 	}
 	return (0);
 }
 #endif	/* sun */
 
 /*
  * Share or unshare a filesystem over NFS or SMB, as selected by
  * zc_share.z_sharetype, and update the sharetab accordingly.  Only
  * implemented when built with 'sun' defined; otherwise returns ENOSYS.
  */
 static int
 zfs_ioc_share(zfs_cmd_t *zc)
 {
 #ifdef sun
 	int error;
 	int opcode;
 
 	/*
 	 * First pass: lazily load the module and resolve the entry point
 	 * needed for this share type, under zfs_share_lock.
 	 */
 	switch (zc->zc_share.z_sharetype) {
 	case ZFS_SHARE_NFS:
 	case ZFS_UNSHARE_NFS:
 		if (zfs_nfsshare_inited == 0) {
 			mutex_enter(&zfs_share_lock);
 			if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			if (znfsexport_fs == NULL &&
 			    ((znfsexport_fs = (int (*)(void *))
 			    ddi_modsym(nfs_mod,
 			    "nfs_export", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
 			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			zfs_nfsshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
 		}
 		break;
 	case ZFS_SHARE_SMB:
 	case ZFS_UNSHARE_SMB:
 		if (zfs_smbshare_inited == 0) {
 			mutex_enter(&zfs_share_lock);
 			if (smbsrv_mod == NULL && ((smbsrv_mod =
 			    ddi_modopen("drv/smbsrv",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			if (zsmbexport_fs == NULL && ((zsmbexport_fs =
 			    (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
 			    "smb_server_share", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
 			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			zfs_smbshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
 		}
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Second pass: call the resolved export routine. */
 	switch (zc->zc_share.z_sharetype) {
 	case ZFS_SHARE_NFS:
 	case ZFS_UNSHARE_NFS:
 		if (error =
 		    znfsexport_fs((void *)
 		    (uintptr_t)zc->zc_share.z_exportdata))
 			return (error);
 		break;
 	case ZFS_SHARE_SMB:
 	case ZFS_UNSHARE_SMB:
 		if (error = zsmbexport_fs((void *)
 		    (uintptr_t)zc->zc_share.z_exportdata,
 		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
 		    B_TRUE: B_FALSE)) {
 			return (error);
 		}
 		break;
 	}
 
 	/* Share requests add a sharetab entry, unshare requests remove one. */
 	opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
 	    zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
 	    SHAREFS_ADD : SHAREFS_REMOVE;
 
 	/*
 	 * Add or remove share from sharetab
 	 */
 	error = zshare_fs(opcode,
 	    (void *)(uintptr_t)zc->zc_share.z_sharedata,
 	    zc->zc_share.z_sharemax);
 
 	return (error);
 
 #else	/* !sun */
 	return (ENOSYS);
 #endif	/* !sun */
 }
 
 /*
  * Single-entry ACL (ACE_ALL_PERMS, ACE_EVERYONE) attached to the files
  * created by the ZFS_SMB_ACL_ADD case of zfs_ioc_smb_acl().
  */
 ace_t full_access[] = {
 	{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
 };
 
 /*
  * Find the next in-use object after zc_obj in the named objset.
  *
  * inputs:
  * zc_name		name of containing filesystem
  * zc_obj		object # beyond which we want next in-use object #
  *
  * outputs:
  * zc_obj		next in-use object #
  */
 static int
 zfs_ioc_next_obj(zfs_cmd_t *zc)
 {
 	objset_t *os = NULL;
 	int error;
 
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	/* The dataset's ds_prev_snap_txg is passed as the txg argument. */
 	error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
-	    os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+	    dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Create a temporary snapshot tied to a cleanup-on-exit descriptor.
  *
  * inputs:
  * zc_name		name of filesystem
  * zc_value		prefix name for snapshot
  * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
  *
  * outputs:
  * zc_value		short name of new snapshot
  */
 static int
 zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
 {
 	minor_t minor;
 	char *snap_name, *hold_name;
 	int error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
 
 	if (error != 0)
 		return (error);
 
 	/* Snapshot name: "<prefix>-<lbolt64 in hex>"; hold tag: "%<prefix>". */
 	snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
 	    (u_longlong_t)ddi_get_lbolt64());
 	hold_name = kmem_asprintf("%%%s", zc->zc_value);
 
 	error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
 	    hold_name);
 	/* Only on success is the generated name handed back in zc_value. */
 	if (error == 0)
 		(void) strcpy(zc->zc_value, snap_name);
 
 	strfree(snap_name);
 	strfree(hold_name);
 	zfs_onexit_fd_rele(zc->zc_cleanup_fd);
 
 	return (error);
 }
 
 /*
  * Write the differences between two snapshots to a file descriptor.
  *
  * inputs:
  * zc_name		name of "to" snapshot
  * zc_value		name of "from" snapshot
  * zc_cookie		file descriptor to write diff data on
  *
  * outputs:
  * dmu_diff_record_t's to the file descriptor
  */
 static int
 zfs_ioc_diff(zfs_cmd_t *zc)
 {
 	cap_rights_t rights;
 	offset_t off;
 	int rc;
 	file_t *fp = getf(zc->zc_cookie, cap_rights_init(&rights, CAP_WRITE));
 
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
 	/* Start at the descriptor's current offset. */
 	off = fp->f_offset;
 
 #ifdef illumos
 	rc = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
 #else
 	rc = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
 #endif
 
 	/* Store the new offset only when it is within range. */
 	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
 	releasef(zc->zc_cookie);
 
 	return (rc);
 }
 
 #ifdef sun
 /*
  * Remove every entry of the shares directory 'dzp'.  The loop ends with
  * the first non-zero result of either the cursor retrieval or VOP_REMOVE(),
  * and that value is returned.
  */
 static int
 zfs_smb_acl_purge(znode_t *dzp)
 {
 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	zap_cursor_t	cursor;
 	zap_attribute_t	entry;
 	int error;
 
 	zap_cursor_init(&cursor, zfsvfs->z_os, dzp->z_id);
 	while ((error = zap_cursor_retrieve(&cursor, &entry)) == 0) {
 		error = VOP_REMOVE(ZTOV(dzp), entry.za_name, kcred,
 		    NULL, 0);
 		if (error != 0)
 			break;
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	return (error);
 }
 #endif	/* sun */
 
 /*
  * Manage the SMB ACL resource files kept in a dataset's shares directory.
  *
  * inputs:
  * zc_name		dataset name; must match the resource of the vfs
  *			that zc_value resolves to
  * zc_value		path that is looked up to find the mounted filesystem
  * zc_cookie		operation: ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE,
  *			ZFS_SMB_ACL_RENAME or ZFS_SMB_ACL_PURGE
  * zc_string		resource name (ADD and REMOVE)
  * zc_nvlist_src{_size}	nvlist holding ZFS_SMB_ACL_SRC and
  *			ZFS_SMB_ACL_TARGET strings (RENAME)
  *
  * Returns 0 on success or an errno value.  Only implemented when built
  * with "sun" defined; otherwise EOPNOTSUPP is returned.
  */
 static int
 zfs_ioc_smb_acl(zfs_cmd_t *zc)
 {
 #ifdef sun
 	vnode_t *vp;
 	znode_t *dzp;
 	vnode_t *resourcevp = NULL;
 	znode_t *sharedir;
 	zfsvfs_t *zfsvfs;
 	nvlist_t *nvlist;
 	char *src, *target;
 	vattr_t vattr;
 	vsecattr_t vsec;
 	int error = 0;
 
 	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
 	    NO_FOLLOW, NULL, &vp)) != 0)
 		return (error);
 
 	/* Now make sure mntpnt and dataset are ZFS */
 
 	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
 	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) != 0)) {
 		VN_RELE(vp);
 		return (SET_ERROR(EINVAL));
 	}
 
 	dzp = VTOZ(vp);
 	zfsvfs = dzp->z_zfsvfs;
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Create the share dir if it is missing.
 	 */
 	mutex_enter(&zfsvfs->z_lock);
 	if (zfsvfs->z_shares_dir == 0) {
 		dmu_tx_t *tx;
 
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
 		    ZFS_SHARES_DIR);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			error = zfs_create_share_dir(zfsvfs, tx);
 			dmu_tx_commit(tx);
 		}
 		if (error != 0) {
 			mutex_exit(&zfsvfs->z_lock);
 			VN_RELE(vp);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	ASSERT(zfsvfs->z_shares_dir);
 	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
 		VN_RELE(vp);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * From here on both vp and sharedir are held.  Every case leaves the
 	 * switch with "break" so that the common release code below runs.
 	 */
 	switch (zc->zc_cookie) {
 	case ZFS_SMB_ACL_ADD:
 		vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 		vattr.va_type = VREG;
 		vattr.va_mode = S_IFREG|0777;
 		vattr.va_uid = 0;
 		vattr.va_gid = 0;
 
 		vsec.vsa_mask = VSA_ACE;
 		vsec.vsa_aclentp = &full_access;
 		vsec.vsa_aclentsz = sizeof (full_access);
 		vsec.vsa_aclcnt = 1;
 
 		error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
 		    &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
 		if (resourcevp)
 			VN_RELE(resourcevp);
 		break;
 
 	case ZFS_SMB_ACL_REMOVE:
 		error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
 		    NULL, 0);
 		break;
 
 	case ZFS_SMB_ACL_RENAME:
 		error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist);
 		if (error != 0) {
 			/*
 			 * Use the common exit so the hold on sharedir is
 			 * dropped as well; returning directly from here
 			 * leaked it.
 			 */
 			break;
 		}
 		if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
 		    nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
 		    &target)) {
 			/*
 			 * "error" is still 0 from get_nvlist() at this point;
 			 * report the malformed request instead of success.
 			 */
 			nvlist_free(nvlist);
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
 		    kcred, NULL, 0);
 		nvlist_free(nvlist);
 		break;
 
 	case ZFS_SMB_ACL_PURGE:
 		error = zfs_smb_acl_purge(sharedir);
 		break;
 
 	default:
 		error = SET_ERROR(EINVAL);
 		break;
 	}
 
 	VN_RELE(vp);
 	VN_RELE(ZTOV(sharedir));
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 #else	/* !sun */
 	return (EOPNOTSUPP);
 #endif	/* !sun */
 }
 
 /*
  * Place user holds on a set of snapshots.
  *
  * innvl: {
  *     "holds" -> { snapname -> holdname (string), ... }
  *     (optional) "cleanup_fd" -> fd (int32)
  * }
  *
  * outnvl: {
  *     snapname -> error value (int32)
  *     ...
  * }
  *
  * Returns EINVAL when "holds" is missing from innvl.  When "cleanup_fd" is
  * supplied, the minor obtained from zfs_onexit_fd_hold() is passed on to
  * dsl_dataset_user_hold().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 {
 	nvlist_t *hold_list;
 	minor_t onexit_minor = 0;
 	int fd = -1;
 	int err;
 
 	if (nvlist_lookup_nvlist(args, "holds", &hold_list) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_int32(args, "cleanup_fd", &fd) == 0) {
 		err = zfs_onexit_fd_hold(fd, &onexit_minor);
 		if (err != 0)
 			return (err);
 	}
 
 	err = dsl_dataset_user_hold(hold_list, onexit_minor, errlist);
 
 	/* Drop the descriptor reference only if one was taken above. */
 	if (onexit_minor != 0)
 		zfs_onexit_fd_rele(fd);
 
 	return (err);
 }
 
 /*
  * Report the user holds on a snapshot.
  *
  * innvl is not used.
  *
  * outnvl: {
  *    holdname -> time added (uint64 seconds since epoch)
  *    ...
  * }
  *
  * Returns the result of dsl_dataset_get_holds().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
 {
 	int err;
 
 	err = dsl_dataset_get_holds(snapname, outnvl);
 	return (err);
 }
 
 /*
  * Release user holds from a set of snapshots.
  *
  * innvl: {
  *     snapname -> { holdname, ... }
  *     ...
  * }
  *
  * outnvl: {
  *     snapname -> error value (int32)
  *     ...
  * }
  *
  * Returns the result of dsl_dataset_user_release().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
 {
 	int err;
 
 	err = dsl_dataset_user_release(holds, errlist);
 	return (err);
 }
 
 /*
  * Report the space written between an old snapshot and a newer dataset.
  *
  * inputs:
  * zc_name		name of new filesystem or snapshot
  * zc_value		full name of old snapshot
  *
  * outputs:
  * zc_cookie		space in bytes
  * zc_objset_type	compressed space in bytes
  * zc_perm_action	uncompressed space in bytes
  *
  * Returns 0 or the first error from taking the pool/dataset holds or from
  * dsl_dataset_space_written().
  */
 static int
 zfs_ioc_space_written(zfs_cmd_t *zc)
 {
 	dsl_pool_t *pool;
 	dsl_dataset_t *newer, *older;
 	int err;
 
 	err = dsl_pool_hold(zc->zc_name, FTAG, &pool);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_hold(pool, zc->zc_name, FTAG, &newer);
 	if (err != 0)
 		goto rele_pool;
 
 	err = dsl_dataset_hold(pool, zc->zc_value, FTAG, &older);
 	if (err != 0)
 		goto rele_newer;
 
 	err = dsl_dataset_space_written(older, newer, &zc->zc_cookie,
 	    &zc->zc_objset_type, &zc->zc_perm_action);
 
 	/* Release the holds in the reverse order they were taken. */
 	dsl_dataset_rele(older, FTAG);
 rele_newer:
 	dsl_dataset_rele(newer, FTAG);
 rele_pool:
 	dsl_pool_rele(pool, FTAG);
 	return (err);
 }
 
 /*
  * Report the space that would be freed by destroying the snapshots from
  * "firstsnap" through lastsnap.
  *
  * innvl: {
  *     "firstsnap" -> snapshot name
  * }
  *
  * outnvl: {
  *     "used" -> space in bytes
  *     "compressed" -> compressed space in bytes
  *     "uncompressed" -> uncompressed space in bytes
  * }
  *
  * Returns EINVAL when "firstsnap" is missing, otherwise 0 or the first
  * error from taking the holds or from dsl_dataset_space_wouldfree().
  */
 static int
 zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	int error;
 	dsl_pool_t *dp;
 	dsl_dataset_t *new, *old;
 	char *firstsnap;
 	/*
 	 * The three counters are added to outnvl even when the computation
 	 * fails, so start them at zero rather than publishing whatever
 	 * happened to be on the stack.
 	 */
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
 		return (SET_ERROR(EINVAL));
 
 	error = dsl_pool_hold(lastsnap, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 	error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
 	if (error != 0) {
 		dsl_dataset_rele(new, FTAG);
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
 	dsl_dataset_rele(old, FTAG);
 	dsl_dataset_rele(new, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	fnvlist_add_uint64(outnvl, "used", used);
 	fnvlist_add_uint64(outnvl, "compressed", comp);
 	fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
 	return (error);
 }
 
 /*
  * Attach the dataset named by zc_name to the jail identified by zc_jailid,
  * using the calling thread's credentials.  Returns the result of
  * zone_dataset_attach().
  */
 static int
 zfs_ioc_jail(zfs_cmd_t *zc)
 {
 	cred_t *cred = curthread->td_ucred;
 	int jid = (int)zc->zc_jailid;
 
 	return (zone_dataset_attach(cred, zc->zc_name, jid));
 }
 
 /*
  * Detach the dataset named by zc_name from the jail identified by
  * zc_jailid, using the calling thread's credentials.  Returns the result
  * of zone_dataset_detach().
  */
 static int
 zfs_ioc_unjail(zfs_cmd_t *zc)
 {
 	cred_t *cred = curthread->td_ucred;
 	int jid = (int)zc->zc_jailid;
 
 	return (zone_dataset_detach(cred, zc->zc_name, jid));
 }
 
 /*
  * Generate a send stream for a snapshot and write it to a descriptor.
  *
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
  *     (optional) "largeblockok" -> (value ignored)
  *         indicates that blocks > 128KB are permitted
  *     (optional) "embedok" -> (value ignored)
  *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  * }
  *
  * outnvl is unused
  *
  * Returns EINVAL when "fd" is missing, EBADF when the descriptor cannot be
  * obtained, otherwise the result of dmu_send().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	cap_rights_t rights;
 	file_t *fp;
 	int error;
 	offset_t off;
 	char *fromname = NULL;
 	int fd;
 	boolean_t largeblockok;
 	boolean_t embedok;
 
 	error = nvlist_lookup_int32(innvl, "fd", &fd);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* "fromsnap" is optional; fromname stays NULL for a full send. */
 	(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
 	largeblockok = nvlist_exists(innvl, "largeblockok");
 	embedok = nvlist_exists(innvl, "embedok");
 
 	/*
 	 * The stream is written to this descriptor, so the capability that
 	 * must be present is CAP_WRITE (as zfs_ioc_diff() requests for its
 	 * output descriptor), not CAP_READ.
 	 */
 	fp = getf(fd, cap_rights_init(&rights, CAP_WRITE));
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
 	off = fp->f_offset;
 	error = dmu_send(snapname, fromname, embedok, largeblockok,
 #ifdef illumos
 	    fd, fp->f_vnode, &off);
 #else
 	    fd, fp, &off);
 #endif
 
 #ifdef illumos
 	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
 		fp->f_offset = off;
 #else
 	fp->f_offset = off;
 #endif
 
 	releasef(fd);
 	return (error);
 }
 
 /*
  * Determine approximately how large a zfs send stream will be -- the number
  * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
  *
  * innvl: {
  *     (optional) "fromsnap" -> full snap name to send an incremental from
  * }
  *
  * outnvl: {
  *     "space" -> bytes of space (uint64)
  * }
  *
  * Returns 0 or the first error from taking the holds or from
  * dmu_send_estimate().  "space" is always added to outnvl once the holds
  * have been taken; it is 0 unless dmu_send_estimate() stored a value.
  */
 static int
 zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *fromsnap = NULL;
 	dsl_dataset_t *tosnap;
 	int error;
 	char *fromname;
 	/*
 	 * Start at zero: the value is published to outnvl even when the
 	 * estimate fails, and must not be uninitialized stack contents.
 	 */
 	uint64_t space = 0;
 
 	error = dsl_pool_hold(snapname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	/* A missing "fromsnap" is not an error; it means a full send. */
 	error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
 	if (error == 0) {
 		error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
 		if (error != 0) {
 			dsl_dataset_rele(tosnap, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
 	}
 
 	error = dmu_send_estimate(tosnap, fromsnap, &space);
 	fnvlist_add_uint64(outnvl, "space", space);
 
 	if (fromsnap != NULL)
 		dsl_dataset_rele(fromsnap, FTAG);
 	dsl_dataset_rele(tosnap, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 
 /*
  * Table of ioctl vectors, indexed by (ioc - ZFS_IOC_FIRST).  Entries are
  * filled in by zfs_ioctl_register() and zfs_ioctl_register_legacy().
  */
 static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
 
 /*
  * Register a handler that uses the legacy (zfs_cmd_t based) signature in
  * the zfs_ioc_vec slot for "ioc".  The slot must not have been registered
  * before; this is asserted.
  */
 static void
 zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
     boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
 {
 	zfs_ioc_vec_t *slot;
 
 	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
 	ASSERT3U(ioc, <, ZFS_IOC_LAST);
 
 	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
 	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
 	ASSERT3P(slot->zvec_func, ==, NULL);
 
 	slot->zvec_legacy_func = func;
 	slot->zvec_secpolicy = secpolicy;
 	slot->zvec_namecheck = namecheck;
 	slot->zvec_allow_log = log_history;
 	slot->zvec_pool_check = pool_check;
 }
 
 /*
  * Register a handler that uses the nvlist-based signature in the
  * zfs_ioc_vec slot for "ioc".  The slot must not have been registered
  * before, and a loggable ioctl must carry a name; both are asserted.
  *
  * See the block comment at the beginning of this file for details on
  * each argument to this function.
  */
 static void
 zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
     zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
     boolean_t allow_log)
 {
 	zfs_ioc_vec_t *slot;
 
 	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
 	ASSERT3U(ioc, <, ZFS_IOC_LAST);
 
 	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
 	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
 	ASSERT3P(slot->zvec_func, ==, NULL);
 
 	/* if we are logging, the name must be valid */
 	ASSERT(!allow_log || namecheck != NO_NAME);
 
 	slot->zvec_name = name;
 	slot->zvec_func = func;
 	slot->zvec_secpolicy = secpolicy;
 	slot->zvec_namecheck = namecheck;
 	slot->zvec_pool_check = pool_check;
 	slot->zvec_smush_outnvlist = smush_outnvlist;
 	slot->zvec_allow_log = allow_log;
 }
 
 /*
  * Register a legacy ioctl whose zc_name is checked as a pool name.
  */
 static void
 zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
     zfs_ioc_poolcheck_t pool_check)
 {
 	const zfs_ioc_namecheck_t namecheck = POOL_NAME;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
 	    log_history, pool_check);
 }
 
 /*
  * Register a legacy ioctl whose zc_name is checked as a dataset name and
  * which is not logged to the pool history.
  */
 static void
 zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
 {
 	const zfs_ioc_namecheck_t namecheck = DATASET_NAME;
 	const boolean_t log_history = B_FALSE;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
 	    log_history, pool_check);
 }
 
 /*
  * Register a legacy pool ioctl that uses zfs_secpolicy_config, is logged,
  * and is checked against both suspended and read-only pools.
  */
 static void
 zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
 	const zfs_ioc_poolcheck_t pool_check =
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY;
 
 	zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
 	    POOL_NAME, B_TRUE, pool_check);
 }
 
 /*
  * Register a legacy ioctl that takes no name (NO_NAME), is not logged and
  * performs no pool status check.
  */
 static void
 zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy)
 {
 	const zfs_ioc_namecheck_t namecheck = NO_NAME;
 	const zfs_ioc_poolcheck_t pool_check = POOL_CHECK_NONE;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
 	    B_FALSE, pool_check);
 }
 
 /*
  * Register a legacy dataset ioctl with a caller-supplied security policy;
  * it is not logged and is only checked against suspended pools.
  */
 static void
 zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
     zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
 {
 	const zfs_ioc_namecheck_t namecheck = DATASET_NAME;
 	const zfs_ioc_poolcheck_t pool_check = POOL_CHECK_SUSPENDED;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
 	    B_FALSE, pool_check);
 }
 
 /*
  * Register a legacy dataset ioctl through
  * zfs_ioctl_register_dataset_read_secpolicy() with zfs_secpolicy_read as
  * its security policy.
  */
 static void
 zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
 	zfs_secpolicy_func_t *policy = zfs_secpolicy_read;
 
 	zfs_ioctl_register_dataset_read_secpolicy(ioc, func, policy);
 }
 
 /*
  * Register a legacy dataset ioctl that is logged and is checked against
  * both suspended and read-only pools.
  */
 static void
 zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy)
 {
 	const zfs_ioc_poolcheck_t pool_check =
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy,
 	    DATASET_NAME, B_TRUE, pool_check);
 }
 
 /*
  * Populate zfs_ioc_vec with every supported ioctl.
  *
  * The first group uses zfs_ioctl_register() (nvlist-based handlers); the
  * remainder use zfs_ioctl_register_legacy() directly or through one of the
  * zfs_ioctl_register_*() convenience wrappers.  Each slot may be registered
  * only once (the register functions assert this).
  */
 static void
 zfs_ioctl_init(void)
 {
 	/* IOCTLS that use the nvlist-based function signature */
 
 	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
 	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
 	    zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
 	    zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
 	    zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
 	    zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("create", ZFS_IOC_CREATE,
 	    zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("clone", ZFS_IOC_CLONE,
 	    zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
 	    zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("hold", ZFS_IOC_HOLD,
 	    zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 	zfs_ioctl_register("release", ZFS_IOC_RELEASE,
 	    zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
 	    zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
 	    zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
 
 	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
 	    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
 	    zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
 	    zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
 	    POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
 	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
 
 	/* Pool-level operations that modify the pool (logged). */
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
 	    zfs_ioc_pool_scan);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
 	    zfs_ioc_pool_upgrade);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
 	    zfs_ioc_vdev_add);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
 	    zfs_ioc_vdev_remove);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
 	    zfs_ioc_vdev_set_state);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
 	    zfs_ioc_vdev_attach);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
 	    zfs_ioc_vdev_detach);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
 	    zfs_ioc_vdev_setpath);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
 	    zfs_ioc_vdev_setfru);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
 	    zfs_ioc_pool_set_props);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
 	    zfs_ioc_vdev_split);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
 	    zfs_ioc_pool_reguid);
 
 	/* Operations registered with NO_NAME and no pool status check. */
 	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
 	    zfs_ioc_pool_configs, zfs_secpolicy_none);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
 	    zfs_ioc_pool_tryimport, zfs_secpolicy_config);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
 	    zfs_ioc_inject_fault, zfs_secpolicy_inject);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
 	    zfs_ioc_clear_fault, zfs_secpolicy_inject);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
 	    zfs_ioc_inject_list_next, zfs_secpolicy_inject);
 
 	/*
 	 * Pool destroy and export do not log the history as part of
 	 * zfsdev_ioctl; rather, zfs_ioc_pool_export
 	 * does the logging of those commands.
 	 */
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
 	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
 	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
 	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
 	    zfs_ioc_dsobj_to_dsname,
 	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
 	    zfs_ioc_pool_get_history,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
 
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
 
 	/* Dataset reads using zfs_secpolicy_read. */
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
 	    zfs_ioc_space_written);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
 	    zfs_ioc_objset_recvd_props);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
 	    zfs_ioc_next_obj);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
 	    zfs_ioc_get_fsacl);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
 	    zfs_ioc_objset_stats);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
 	    zfs_ioc_objset_zplprops);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
 	    zfs_ioc_dataset_list_next);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	    zfs_ioc_snapshot_list_next);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
 	    zfs_ioc_send_progress);
 
 	/* Dataset reads with their own security policy. */
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
 	    zfs_ioc_diff, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
 	    zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
 	    zfs_ioc_obj_to_path, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
 	    zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
 	    zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
 	    zfs_ioc_send, zfs_secpolicy_send);
 
 	/* Dataset modifications (logged). */
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
 	    zfs_secpolicy_none);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
 	    zfs_secpolicy_destroy);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
 	    zfs_secpolicy_rename);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
 	    zfs_secpolicy_recv);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
 	    zfs_secpolicy_promote);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
 	    zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
 	    zfs_secpolicy_set_fsacl);
 
 	/* Dataset operations that are not logged. */
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
 	    zfs_secpolicy_share, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
 	    zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
 	    zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
 	    zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 
 #ifdef __FreeBSD__
 	/* FreeBSD-only: attach/detach a dataset to/from a jail. */
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
 #endif
 }
 
 /*
  * Check the state of the pool that "name" belongs to against the
  * requested checks.
  *
  * Returns 0 immediately when POOL_CHECK_NONE is set.  Otherwise the pool is
  * opened; the result is the spa_open() error, EAGAIN if
  * POOL_CHECK_SUSPENDED is requested and the pool is suspended, EROFS if
  * POOL_CHECK_READONLY is requested and the pool is not writeable, or 0.
  */
 int
 pool_status_check(const char *name, zfs_ioc_namecheck_t type,
     zfs_ioc_poolcheck_t check)
 {
 	spa_t *spa;
 	int err;
 
 	ASSERT(type == POOL_NAME || type == DATASET_NAME);
 
 	if (check & POOL_CHECK_NONE)
 		return (0);
 
 	err = spa_open(name, &spa, FTAG);
 	if (err != 0)
 		return (err);
 
 	/* The suspended check takes precedence over the read-only check. */
 	if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) {
 		err = SET_ERROR(EAGAIN);
 	} else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) {
 		err = SET_ERROR(EROFS);
 	}
 	spa_close(spa, FTAG);
 
 	return (err);
 }
 
 /*
  * Find a free minor number.
  *
  * The search starts just after the most recently returned minor and wraps
  * from ZFSDEV_MAX_MINOR back to 1.  A minor is free when it has no soft
  * state.  Returns the minor, or 0 if the search comes back around to the
  * starting point.  The caller must hold spa_namespace_lock.
  */
 minor_t
 zfsdev_minor_alloc(void)
 {
 	static minor_t last_minor;
 	minor_t candidate;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	candidate = last_minor + 1;
 	while (candidate != last_minor) {
 		if (candidate > ZFSDEV_MAX_MINOR)
 			candidate = 1;
 		if (ddi_get_soft_state(zfsdev_state, candidate) == NULL) {
 			last_minor = candidate;
 			return (candidate);
 		}
 		candidate++;
 	}
 
 	return (0);
 }
 
 static int
 zfs_ctldev_init(struct cdev *devp)
 {
 	minor_t minor;
 	zfs_soft_state_t *zs;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	minor = zfsdev_minor_alloc();
 	if (minor == 0)
 		return (SET_ERROR(ENXIO));
 
 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
 		return (SET_ERROR(EAGAIN));
 
 	devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
 
 	zs = ddi_get_soft_state(zfsdev_state, minor);
 	zs->zss_type = ZSST_CTLDEV;
 	zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
 
 	return (0);
 }
 
 /*
  * Tear down the per-open control device state: destroy the on-exit list
  * "zo" and free the soft state of "minor".  The caller must hold
  * spa_namespace_lock (asserted).
  */
 static void
 zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	zfs_onexit_destroy(zo);
 	ddi_soft_state_free(zfsdev_state, minor);
 }
 
 /*
  * Return the data pointer stored in the soft state of "minor", provided
  * the soft state exists and is of type "which"; otherwise return NULL.
  */
 void *
 zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
 {
 	zfs_soft_state_t *state = ddi_get_soft_state(zfsdev_state, minor);
 
 	if (state != NULL && state->zss_type == which)
 		return (state->zss_data);
 
 	return (NULL);
 }
 
 /*
  * Open entry point for the control device.  Per-open state is only set up
  * when the device is opened with FEXCL; any other open succeeds without
  * doing anything.  Returns 0 or the error from zfs_ctldev_init().
  */
 static int
 zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
 {
 	int err;
 
 #ifdef sun
 	if (getminor(*devp) != 0)
 		return (zvol_open(devp, flag, otyp, cr));
 #endif
 
 	/* This is the control device. Allocate a new minor if requested. */
 	if (!(flag & FEXCL))
 		return (0);
 
 	mutex_enter(&spa_namespace_lock);
 	err = zfs_ctldev_init(devp);
 	mutex_exit(&spa_namespace_lock);
 
 	return (err);
 }
 
 /*
  * Destructor for the cdev private data set up by zfs_ctldev_init().
  * "data" carries the minor number; minor 0 means there is nothing to do.
  * Otherwise the control-device state of that minor, if it exists, is
  * destroyed under spa_namespace_lock.
  */
 static void
 zfsdev_close(void *data)
 {
 	minor_t minor = (minor_t)(uintptr_t)data;
 	zfs_onexit_t *onexit;
 
 	if (minor == 0)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	onexit = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
 	if (onexit != NULL)
 		zfs_ctldev_destroy(onexit, minor);
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ioctl entry point for the ZFS control device.
  *
  * Decodes the command number from the low byte of zcmd, obtains the
  * zfs_cmd_t from the caller (translating older command layouts through the
  * zfs_cmd_compat_*() / zfs_ioctl_compat_*() helpers when the parameter
  * length or the ioctl version identifies a compat caller), validates
  * zc_name according to the vector's namecheck, runs the vector's security
  * policy, dispatches to either the nvlist-based handler (zvec_func) or the
  * legacy handler (zvec_legacy_func), and finally hands the zfs_cmd_t back
  * to the caller.
  *
  * Returns 0 or an errno value.
  */
 static int
 zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
     struct thread *td)
 {
 	zfs_cmd_t *zc;
 	uint_t vecnum;
 	int error, rc, len;
 #ifdef illumos
 	minor_t minor = getminor(dev);
 #else
 	zfs_iocparm_t *zc_iocparm;
 	int cflag, cmd, oldvecnum;
 	boolean_t newioc, compat;
 	void *compat_zc = NULL;
 	cred_t *cr = td->td_ucred;
 #endif
 	const zfs_ioc_vec_t *vec;
 	char *saved_poolname = NULL;
 	nvlist_t *innvl = NULL;
 
 	cflag = ZFS_CMD_COMPAT_NONE;
 	compat = B_FALSE;
 	newioc = B_TRUE;	/* "new" style (zfs_iocparm_t) ioctl */
 
 	/* The low byte of zcmd selects the command. */
 	len = IOCPARM_LEN(zcmd);
 	vecnum = cmd = zcmd & 0xff;
 
 	/*
 	 * Check if we are talking to supported older binaries
 	 * and translate zfs_cmd if necessary
 	 */
 	if (len != sizeof(zfs_iocparm_t)) {
 		newioc = B_FALSE;
 		compat = B_TRUE;
 
 		vecnum = cmd;
 
 		/* The parameter length identifies which old layout is in use. */
 		switch (len) {
 		case sizeof(zfs_cmd_zcmd_t):
 			cflag = ZFS_CMD_COMPAT_LZC;
 			break;
 		case sizeof(zfs_cmd_deadman_t):
 			cflag = ZFS_CMD_COMPAT_DEADMAN;
 			break;
 		case sizeof(zfs_cmd_v28_t):
 			cflag = ZFS_CMD_COMPAT_V28;
 			break;
 		case sizeof(zfs_cmd_v15_t):
 			cflag = ZFS_CMD_COMPAT_V15;
 			/*
 			 * NOTE(review): cmd can be any value in 0..255 and
 			 * indexes the table with no bounds check visible
 			 * here -- confirm the table covers that range.
 			 */
 			vecnum = zfs_ioctl_v15_to_v28[cmd];
 
 			/*
 			 * Return without further handling
 			 * if the command is blacklisted.
 			 */
 			if (vecnum == ZFS_IOC_COMPAT_PASS)
 				return (0);
 			else if (vecnum == ZFS_IOC_COMPAT_FAIL)
 				return (ENOTSUP);
 			break;
 		default:
 			return (EINVAL);
 		}
 	}
 
 #ifdef illumos
 	vecnum = cmd - ZFS_IOC_FIRST;
 	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
 #endif
 
 	if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (SET_ERROR(EINVAL));
 	vec = &zfs_ioc_vec[vecnum];
 
 	zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
 
 #ifdef illumos
 	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
 	if (error != 0) {
 		error = SET_ERROR(EFAULT);
 		goto out;
 	}
 #else	/* !illumos */
 	/* zc was just allocated zeroed; this bzero() repeats that. */
 	bzero(zc, sizeof(zfs_cmd_t));
 
 	if (newioc) {
 		/* arg is a zfs_iocparm_t describing the user's zfs_cmd. */
 		zc_iocparm = (void *)arg;
 
 		switch (zc_iocparm->zfs_ioctl_version) {
 		case ZFS_IOCVER_CURRENT:
 			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
 				error = SET_ERROR(EINVAL);
 				goto out;
 			}
 			break;
 		case ZFS_IOCVER_ZCMD:
 			if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
 			    zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 			compat = B_TRUE;
 			cflag = ZFS_CMD_COMPAT_ZCMD;
 			break;
 		default:
 			error = SET_ERROR(EINVAL);
 			goto out;
 			/* NOTREACHED */
 		}
 
 		if (compat) {
 			/* Copy the older layout into a scratch buffer. */
 			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
 			compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
 			bzero(compat_zc, sizeof(zfs_cmd_t));
 
 			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    compat_zc, zc_iocparm->zfs_cmd_size, flag);
 			if (error != 0) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 		} else {
 			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    zc, zc_iocparm->zfs_cmd_size, flag);
 			if (error != 0) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 		}
 	}
 
 	if (compat) {
 		/* Convert the older layout (scratch copy or arg) into zc. */
 		if (newioc) {
 			ASSERT(compat_zc != NULL);
 			zfs_cmd_compat_get(zc, compat_zc, cflag);
 		} else {
 			ASSERT(compat_zc == NULL);
 			zfs_cmd_compat_get(zc, arg, cflag);
 		}
 		oldvecnum = vecnum;
 		error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
 		if (error != 0)
 			goto out;
 		/*
 		 * NOTE(review): vecnum is not re-validated against the size
 		 * of zfs_ioc_vec after zfs_ioctl_compat_pre() changes it.
 		 */
 		if (oldvecnum != vecnum)
 			vec = &zfs_ioc_vec[vecnum];
 	}
 #endif	/* !illumos */
 
 	zc->zc_iflags = flag & FKIOCTL;
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &innvl);
 		if (error != 0)
 			goto out;
 	}
 
 	/* rewrite innvl for backwards compatibility */
 	if (compat)
 		innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
 
 	/*
 	 * Ensure that all pool/dataset names are valid before we pass down to
 	 * the lower layers.
 	 */
 	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
 	switch (vec->zvec_namecheck) {
 	case POOL_NAME:
 		if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
 			error = SET_ERROR(EINVAL);
 		else
 			error = pool_status_check(zc->zc_name,
 			    vec->zvec_namecheck, vec->zvec_pool_check);
 		break;
 
 	case DATASET_NAME:
 		if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
 			error = SET_ERROR(EINVAL);
 		else
 			error = pool_status_check(zc->zc_name,
 			    vec->zvec_namecheck, vec->zvec_pool_check);
 		break;
 
 	case NO_NAME:
 		break;
 	}
 
 	/* The security policy is skipped when FKIOCTL is set in flag. */
 	if (error == 0 && !(flag & FKIOCTL))
 		error = vec->zvec_secpolicy(zc, innvl, cr);
 
 	if (error != 0)
 		goto out;
 
 	/* legacy ioctls can modify zc_name */
 	/* Keep a copy of zc_name up to the first '/', '@' or '#'. */
 	len = strcspn(zc->zc_name, "/@#") + 1;
 	saved_poolname = kmem_alloc(len, KM_SLEEP);
 	(void) strlcpy(saved_poolname, zc->zc_name, len);
 
 	if (vec->zvec_func != NULL) {
 		nvlist_t *outnvl;
 		int puterror = 0;
 		spa_t *spa;
 		nvlist_t *lognv = NULL;
 
 		ASSERT(vec->zvec_legacy_func == NULL);
 
 		/*
 		 * Add the innvl to the lognv before calling the func,
 		 * in case the func changes the innvl.
 		 */
 		if (vec->zvec_allow_log) {
 			lognv = fnvlist_alloc();
 			fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
 			    vec->zvec_name);
 			if (!nvlist_empty(innvl)) {
 				fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
 				    innvl);
 			}
 		}
 
 		outnvl = fnvlist_alloc();
 		error = vec->zvec_func(zc->zc_name, innvl, outnvl);
 
 		/* Record the ioctl in the pool history on success. */
 		if (error == 0 && vec->zvec_allow_log &&
 		    spa_open(zc->zc_name, &spa, FTAG) == 0) {
 			if (!nvlist_empty(outnvl)) {
 				fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
 				    outnvl);
 			}
 			(void) spa_history_log_nvl(spa, lognv);
 			spa_close(spa, FTAG);
 		}
 		fnvlist_free(lognv);
 
 		/* rewrite outnvl for backwards compatibility */
 		if (compat)
 			outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
 			    cflag);
 
 		/*
 		 * Hand outnvl back to the caller.  If smushing is enabled
 		 * for this vector and fails, the put is skipped and no
 		 * error is recorded for it.
 		 */
 		if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
 			int smusherror = 0;
 			if (vec->zvec_smush_outnvlist) {
 				smusherror = nvlist_smush(outnvl,
 				    zc->zc_nvlist_dst_size);
 			}
 			if (smusherror == 0)
 				puterror = put_nvlist(zc, outnvl);
 		}
 
 		/* A failure to return outnvl overrides the handler's result. */
 		if (puterror != 0)
 			error = puterror;
 
 		nvlist_free(outnvl);
 	} else {
 		error = vec->zvec_legacy_func(zc);
 	}
 
 out:
 	nvlist_free(innvl);
 
 #ifdef illumos
 	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
 	if (error == 0 && rc != 0)
 		error = SET_ERROR(EFAULT);
 #else
 	/* Return zc to the caller in the layout the caller used. */
 	if (compat) {
 		zfs_ioctl_compat_post(zc, cmd, cflag);
 		if (newioc) {
 			ASSERT(compat_zc != NULL);
 			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
 
 			zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
 			rc = ddi_copyout(compat_zc,
 			    (void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    zc_iocparm->zfs_cmd_size, flag);
 			if (error == 0 && rc != 0)
 				error = SET_ERROR(EFAULT);
 			kmem_free(compat_zc, sizeof (zfs_cmd_t));
 		} else {
 			zfs_cmd_compat_put(zc, arg, vecnum, cflag);
 		}
 	} else {
 		ASSERT(newioc);
 
 		rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
 		    sizeof (zfs_cmd_t), flag);
 		if (error == 0 && rc != 0)
 			error = SET_ERROR(EFAULT);
 	}
 #endif
 	/*
 	 * On success of a loggable ioctl, store saved_poolname in the
 	 * thread-specific slot zfs_allow_log_key (freeing any string that
 	 * was there before); otherwise free it here.
 	 */
 	if (error == 0 && vec->zvec_allow_log) {
 		char *s = tsd_get(zfs_allow_log_key);
 		if (s != NULL)
 			strfree(s);
 		(void) tsd_set(zfs_allow_log_key, saved_poolname);
 	} else {
 		if (saved_poolname != NULL)
 			strfree(saved_poolname);
 	}
 
 	kmem_free(zc, sizeof (zfs_cmd_t));
 	return (error);
 }
 
 #ifdef sun
 /*
  * Driver attach entry point.  Only DDI_ATTACH is supported: it creates the
  * "zfs" minor node, remembers the dev_info pointer in zfs_dip and reports
  * the device.  Returns DDI_SUCCESS or DDI_FAILURE.
  */
 static int
 zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
 	if (cmd != DDI_ATTACH ||
 	    ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
 	    DDI_PSEUDO, 0) == DDI_FAILURE) {
 		return (DDI_FAILURE);
 	}
 
 	zfs_dip = dip;
 	ddi_report_dev(dip);
 
 	return (DDI_SUCCESS);
 }
 
 /*
  * Driver detach entry point.  Fails while spa, zfs or zvol report busy, or
  * for any command other than DDI_DETACH.  Otherwise clears zfs_dip and
  * removes the node's properties and minor nodes.
  */
 static int
 zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
 	/* The busy checks run first, exactly as before. */
 	if (spa_busy() || zfs_busy() || zvol_busy() || cmd != DDI_DETACH)
 		return (DDI_FAILURE);
 
 	zfs_dip = NULL;
 
 	ddi_prop_remove_all(dip);
 	ddi_remove_minor_node(dip, NULL);
 
 	return (DDI_SUCCESS);
 }
 
 /*
  * getinfo entry point for the ZFS driver.
  *
  * DDI_INFO_DEVT2DEVINFO stores zfs_dip in *result and
  * DDI_INFO_DEVT2INSTANCE stores 0; both return DDI_SUCCESS.  Every other
  * request leaves *result untouched and returns DDI_FAILURE.
  */
 /*ARGSUSED*/
 static int
 zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 {
 	int status = DDI_SUCCESS;
 
 	if (infocmd == DDI_INFO_DEVT2DEVINFO)
 		*result = zfs_dip;
 	else if (infocmd == DDI_INFO_DEVT2INSTANCE)
 		*result = (void *)0;
 	else
 		status = DDI_FAILURE;
 
 	return (status);
 }
 #endif	/* sun */
 
 /*
  * OK, so this is a little weird.
  *
  * /dev/zfs is the control node, i.e. minor 0.
  * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
  *
  * /dev/zfs has basically nothing to do except serve up ioctls,
  * so most of the standard driver entry points are in zvol.c.
  */
 #ifdef sun
 /*
  * Character/block entry-point table for the ZFS driver (compiled only
  * when `sun` is defined).  open, close and ioctl are served by the
  * zfsdev_* routines; strategy, dump, read and write are routed to the
  * zvol_* routines; the remaining slots are filled with nodev/nochpoll.
  * The initializer is positional, so the order of entries must not change.
  */
 static struct cb_ops zfs_cb_ops = {
 	zfsdev_open,	/* open */
 	zfsdev_close,	/* close */
 	zvol_strategy,	/* strategy */
 	nodev,		/* print */
 	zvol_dump,	/* dump */
 	zvol_read,	/* read */
 	zvol_write,	/* write */
 	zfsdev_ioctl,	/* ioctl */
 	nodev,		/* devmap */
 	nodev,		/* mmap */
 	nodev,		/* segmap */
 	nochpoll,	/* poll */
 	ddi_prop_op,	/* prop_op */
 	NULL,		/* streamtab */
 	D_NEW | D_MP | D_64BIT,		/* Driver compatibility flag */
 	CB_REV,		/* version */
 	nodev,		/* async read */
 	nodev,		/* async write */
 };
 
 /*
  * Device operations table for the ZFS driver (compiled only when `sun`
  * is defined).  It wires zfs_info, zfs_attach and zfs_detach as the
  * info/attach/detach handlers and points at zfs_cb_ops for the
  * character/block entry points.  The initializer is positional.
  */
 static struct dev_ops zfs_dev_ops = {
 	DEVO_REV,	/* version */
 	0,		/* refcnt */
 	zfs_info,	/* info */
 	nulldev,	/* identify */
 	nulldev,	/* probe */
 	zfs_attach,	/* attach */
 	zfs_detach,	/* detach */
 	nodev,		/* reset */
 	&zfs_cb_ops,	/* driver operations */
 	NULL,		/* no bus operations */
 	NULL,		/* power */
 	ddi_quiesce_not_needed,	/* quiesce */
 };
 
 /*
  * Driver linkage record: associates the description string
  * "ZFS storage pool" with zfs_dev_ops.  Referenced from modlinkage.
  */
 static struct modldrv zfs_modldrv = {
 	&mod_driverops,
 	"ZFS storage pool",
 	&zfs_dev_ops
 };
 
 /*
  * Module linkage passed to mod_install(), mod_remove() and mod_info().
  * It lists two linkage records, zfs_modlfs (defined outside this part of
  * the file) and zfs_modldrv, followed by a terminating NULL.
  */
 static struct modlinkage modlinkage = {
 	MODREV_1,
 	(void *)&zfs_modlfs,
 	(void *)&zfs_modldrv,
 	NULL
 };
 #endif	/* sun */
 
 /*
  * Character-device switch for the ZFS control device created by
  * zfsdev_init().  Only the open and ioctl entry points are supplied;
  * all other members are left at their zero-initialized defaults.
  */
 static struct cdevsw zfs_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	zfsdev_open,
 	.d_ioctl =	zfsdev_ioctl,
 	.d_name =	ZFS_DEV_NAME
 };
 
 /*
  * Destructor registered with tsd_create() for zfs_allow_log_key.
  * `arg` is the string held in the thread-specific slot; it is released
  * with strfree().
  */
 static void
 zfs_allow_log_destroy(void *arg)
 {
 	strfree((char *)arg);
 }
 
 /*
  * Create the ZFS control device node, named ZFS_DEV_NAME and backed by
  * zfs_cdevsw, owned by UID_ROOT/GID_OPERATOR with mode 0666, and store
  * the resulting handle in zfsdev.
  */
 static void
 zfsdev_init(void)
 {
 	zfsdev = make_dev(&zfs_cdevsw, 0x0,
 	    UID_ROOT, GID_OPERATOR, 0666,
 	    ZFS_DEV_NAME);
 }
 
 /*
  * Destroy the ZFS control device node, if one is recorded in zfsdev.
  */
 static void
 zfsdev_fini(void)
 {
 	/* Nothing to tear down when no device handle is recorded. */
 	if (zfsdev == NULL)
 		return;
 
 	destroy_dev(zfsdev);
 }
 
 /* Root-mount hold token acquired and released inside zfs__init(). */
 static struct root_hold_token *zfs_root_token;
 /*
  * NOTE(review): not referenced in this part of the file; presumably the
  * kernel process that owns ZFS threads -- confirm against its users.
  */
 struct proc *zfsproc;
 
 #ifdef sun
 int
 _init(void)
 {
 	int error;
 
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	zfs_ioctl_init();
 
 	if ((error = mod_install(&modlinkage)) != 0) {
 		zvol_fini();
 		zfs_fini();
 		spa_fini();
 		return (error);
 	}
 
 	tsd_create(&zfs_fsyncer_key, NULL);
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
 
 	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
 	ASSERT(error == 0);
 	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (0);
 }
 
 int
 _fini(void)
 {
 	int error;
 
 	if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
 		return (SET_ERROR(EBUSY));
 
 	if ((error = mod_remove(&modlinkage)) != 0)
 		return (error);
 
 	zvol_fini();
 	zfs_fini();
 	spa_fini();
 	if (zfs_nfsshare_inited)
 		(void) ddi_modclose(nfs_mod);
 	if (zfs_smbshare_inited)
 		(void) ddi_modclose(smbsrv_mod);
 	if (zfs_nfsshare_inited || zfs_smbshare_inited)
 		(void) ddi_modclose(sharefs_mod);
 
 	tsd_destroy(&zfs_fsyncer_key);
 	ldi_ident_release(zfs_li);
 	zfs_li = NULL;
 	mutex_destroy(&zfs_share_lock);
 
 	return (error);
 }
 
 /*
  * Module information entry point (compiled only when `sun` is defined):
  * forwards the request to mod_info() for this module's linkage and
  * returns its result.
  */
 int
 _info(struct modinfo *modinfop)
 {
 	int rc = mod_info(&modlinkage, modinfop);
 
 	return (rc);
 }
 #endif	/* sun */
 
 /*
  * Forward declarations for the module load/unload helpers and the
  * shutdown hook.  The later definitions of zfs__init() and zfs__fini()
  * omit `static` but keep internal linkage because of these declarations.
  */
 static int zfs__init(void);
 static int zfs__fini(void);
 static void zfs_shutdown(void *, int);
 
 /* Tag returned when zfs_shutdown() is registered in zfs_modevent(). */
 static eventhandler_tag zfs_shutdown_event_tag;
 
 /*
  * Bring up ZFS at module load time.
  *
  * Holds the root mount while zfs_share_lock, spa, zfs, zvol, the ioctl
  * table and the three thread-specific-data keys are initialized, prints
  * the version banner, releases the root-mount hold and finally creates
  * the control device.  Always returns 0.
  */
 int
 zfs__init(void)
 {
 
 	/* Taken here and dropped below, once the subsystems are up. */
 	zfs_root_token = root_mount_hold("ZFS");
 
 	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	zfs_ioctl_init();
 
 	/* The same three keys are destroyed again in zfs__fini(). */
 	tsd_create(&zfs_fsyncer_key, NULL);
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
 
 	printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
 	root_mount_rel(zfs_root_token);
 
 	/* The control device is created last, after everything else is set up. */
 	zfsdev_init();
 
 	return (0);
 }
 
 /*
  * Tear down ZFS at module unload or shutdown time.
  *
  * Returns EBUSY, without changing anything, while spa, zfs or zvol are
  * busy or zio injection is enabled.  Otherwise it destroys the control
  * device, shuts down zvol, zfs and spa, destroys the three
  * thread-specific-data keys and zfs_share_lock, and returns 0.
  */
 int
 zfs__fini(void)
 {
 	if (spa_busy() || zfs_busy() || zvol_busy() ||
 	    zio_injection_enabled) {
 		return (EBUSY);
 	}
 
 	/* The device node goes first, before the subsystems are shut down. */
 	zfsdev_fini();
 	zvol_fini();
 	zfs_fini();
 	spa_fini();
 
 	/* Matches the three tsd_create() calls in zfs__init(). */
 	tsd_destroy(&zfs_fsyncer_key);
 	tsd_destroy(&rrw_tsd_key);
 	tsd_destroy(&zfs_allow_log_key);
 
 	mutex_destroy(&zfs_share_lock);
 
 	return (0);
 }
 
 /*
  * Shutdown event handler, registered by zfs_modevent() on module load.
  * Runs zfs__fini() and ignores its result, unless the system has
  * panicked.
  */
 static void
 zfs_shutdown(void *arg __unused, int howto __unused)
 {
 
 	/*
 	 * The ZFS fini routines cannot work properly on a panicked system,
 	 * so skip the teardown entirely in that case.
 	 */
 	if (panicstr != NULL)
 		return;
 
 	(void)zfs__fini();
 }
 
 
 /*
  * Module event handler for "zfsctrl".
  *
  * MOD_LOAD runs zfs__init() and, on success, registers zfs_shutdown() for
  * the shutdown_post_sync event.  MOD_UNLOAD runs zfs__fini() and, on
  * success, deregisters that handler if a tag was recorded.  MOD_SHUTDOWN
  * returns 0 without doing anything.  Any other event type returns
  * EOPNOTSUPP.
  */
 static int
 zfs_modevent(module_t mod, int type, void *unused __unused)
 {
 	int rc = EOPNOTSUPP;
 
 	switch (type) {
 	case MOD_LOAD:
 		rc = zfs__init();
 		if (rc == 0) {
 			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
 			    shutdown_post_sync, zfs_shutdown, NULL,
 			    SHUTDOWN_PRI_FIRST);
 		}
 		break;
 	case MOD_UNLOAD:
 		rc = zfs__fini();
 		if (rc == 0 && zfs_shutdown_event_tag != NULL) {
 			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 			    zfs_shutdown_event_tag);
 		}
 		break;
 	case MOD_SHUTDOWN:
 		rc = 0;
 		break;
 	default:
 		/* Unhandled event types keep the EOPNOTSUPP default. */
 		break;
 	}
 
 	return (rc);
 }
 
 /*
  * Module descriptor for "zfsctrl": events are dispatched to
  * zfs_modevent(); the third member is left as 0.
  */
 static moduledata_t zfs_mod = {
 	"zfsctrl",
 	zfs_modevent,
 	0
 };
 /*
  * Declare the module at SI_SUB_VFS / SI_ORDER_ANY with version 1, and
  * record its dependencies on version 1 of the opensolaris, krpc and
  * acl_nfs4 modules.
  */
 DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(zfsctrl, 1);
 MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
 MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
 MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
Index: head/sys/cddl/contrib/opensolaris
===================================================================
--- head/sys/cddl/contrib/opensolaris	(revision 275781)
+++ head/sys/cddl/contrib/opensolaris	(revision 275782)

Property changes on: head/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor-sys/illumos/dist:r275551