Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 296519)
@@ -1,3806 +1,3807 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
 #include <zfs_comutil.h>
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include <libzfs.h>
 
 /*
  * Bounds-checked name lookups.  Each macro yields the name stored in the
  * corresponding table when idx is inside the table and the literal
  * "UNKNOWN" otherwise.  Note that idx is evaluated more than once.
  */
 #define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
 	zio_compress_table[(idx)].ci_name : "UNKNOWN")
 #define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
 	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
 /*
  * Object type name: taken from dmu_ot[] for idx below DMU_OT_NUMTYPES,
  * otherwise from dmu_ot_byteswap[] when DMU_OT_IS_VALID(idx) holds,
  * otherwise "UNKNOWN".
  */
 #define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
 	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
 	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
 /*
  * Object type index: idx itself when below DMU_OT_NUMTYPES; the two
  * DMU_OTN_ZAP_* values map to DMU_OT_ZAP_OTHER; anything else maps to
  * DMU_OT_NUMTYPES.
  */
 #define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
 	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
 	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
 
 /*
  * These variables are only declared here in normal builds; when lint is
  * defined they are defined in this file instead.
  */
 #ifndef lint
 extern boolean_t zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
 extern int zfs_vdev_async_read_max_active;
 #else
 boolean_t zfs_recover;
 uint64_t zfs_arc_max, zfs_arc_meta_limit;
 int zfs_vdev_async_read_max_active;
 #endif
 
 /* Program name; used as the prefix in usage() and fatal() output. */
 const char cmdname[] = "zdb";
 /* Per-option level, indexed by the option character (e.g. dump_opt['d']). */
 uint8_t dump_opt[256];
 
 /* Signature shared by the per-object-type dump_*() viewer functions. */
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 extern void dump_intent_log(zilog_t *);
 /*
  * Array of zopt_objects numeric selectors; dump_metaslabs() reads entry 0
  * as a vdev id and the remaining entries as metaslab numbers.
  * NOTE(review): presumably filled from the trailing command-line
  * arguments -- confirm against main().
  */
 static uint64_t *zopt_object = NULL;
 static int zopt_objects = 0;
 static libzfs_handle_t *g_zfs;
 /* NOTE(review): presumably the -I (inflight I/Os) limit -- confirm. */
 static uint64_t max_inflight = 1000;
 
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  *
  * _umem_debug_init() returns the string used as the $UMEM_DEBUG setting.
  * It is declared with an explicit (void) parameter list so that it is a
  * real prototype, consistent with _umem_logging_init().  The returned
  * string has static storage and must not be freed or modified.
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 /*
  * libumem hook: returns the string used as the $UMEM_LOGGING setting.
  * The returned string has static storage.
  */
 const char *
 _umem_logging_init(void)
 {
 	static const char umem_logging[] = "fail,contents";
 
 	return (umem_logging);
 }
 
 /*
  * Print the command synopsis and option summary to stderr, then exit with
  * status 1.  This function does not return.
  *
  * The default quoted for -I must be kept in sync with the initial value of
  * max_inflight (1000).
  */
 static void
 usage(void)
 {
 	(void) fprintf(stderr,
 	    "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
 	    "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
 	    "       %s [-divPA] [-e -p path...] [-U config] dataset "
 	    "[object...]\n"
 	    "       %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
 	    "poolname [vdev [metaslab...]]\n"
 	    "       %s -R [-A] [-e [-p path...]] poolname "
 	    "vdev:offset:size[:flags]\n"
 	    "       %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
 	    "       %s -l [-uA] device\n"
 	    "       %s -C [-A] [-U config]\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr, "    If object numbers are specified, only "
 	    "those objects are dumped\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -u uberblock\n");
 	(void) fprintf(stderr, "        -d dataset(s)\n");
 	(void) fprintf(stderr, "        -i intent logs\n");
 	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "        -b block statistics\n");
 	(void) fprintf(stderr, "        -m metaslabs\n");
 	(void) fprintf(stderr, "        -M metaslab groups\n");
 	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -D dedup statistics\n");
 	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -l dump label contents\n");
 	(void) fprintf(stderr, "        -L disable leak tracking (do not "
 	    "load spacemaps)\n");
 	(void) fprintf(stderr, "        -R read and display block from a "
 	    "device\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
 	    "panic recovery (-AA) or both (-AAA)\n");
 	(void) fprintf(stderr, "        -F attempt automatic rewind within "
 	    "safe range of transaction groups\n");
 	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
 	    "cachefile\n");
 	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
 	    "work with dataset)\n");
 	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
 	    "has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -p <path> -- use one or more with "
 	    "-e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -x <dumpdir> -- "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
 	    "searching for uberblocks\n");
 	/* The quoted default mirrors the initializer of max_inflight. */
 	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
 	    "specify the maximum number of "
 	    "checksumming I/Os [default is 1000]\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }
 
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  *
  * Writes "<cmdname>: <message>\n" to stderr, where <message> is built from
  * the printf-style format and arguments, then exits with status 1.
  */
 
 static void
 fatal(const char *fmt, ...)
 {
 	va_list args;
 
 	(void) fprintf(stderr, "%s: ", cmdname);
 
 	va_start(args, fmt);
 	(void) vfprintf(stderr, fmt, args);
 	va_end(args);
 
 	(void) fprintf(stderr, "\n");
 
 	exit(1);
 }
 
 /*
  * Object viewer for a packed nvlist.  'data' points at a 64-bit value that
  * gives the packed length; that many bytes are read from the start of the
  * object, unpacked, and printed with dump_nvlist().  A failed read or
  * unpack trips VERIFY.
  */
 /* ARGSUSED */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	size_t packed_len = *(uint64_t *)data;
 	char *packed_buf = umem_alloc(packed_len, UMEM_NOFAIL);
 	nvlist_t *unpacked;
 
 	VERIFY(0 == dmu_read(os, object, 0, packed_len, packed_buf,
 	    DMU_READ_PREFETCH));
 	VERIFY(nvlist_unpack(packed_buf, packed_len, &unpacked, 0) == 0);
 
 	/* The packed copy is not needed once it has been unpacked. */
 	umem_free(packed_buf, packed_len);
 
 	dump_nvlist(unpacked, 8);
 	nvlist_free(unpacked);
 }
 
 /*
  * Object viewer that prints the fields of the spa_history_phys_t that
  * 'data' points to, one per line.  Prints nothing when 'data' is NULL.
  */
 /* ARGSUSED */
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	spa_history_phys_t *hist = data;
 
 	if (hist != NULL) {
 		(void) printf("\t\tpool_create_len = %llu\n",
 		    (u_longlong_t)hist->sh_pool_create_len);
 		(void) printf("\t\tphys_max_off = %llu\n",
 		    (u_longlong_t)hist->sh_phys_max_off);
 		(void) printf("\t\tbof = %llu\n",
 		    (u_longlong_t)hist->sh_bof);
 		(void) printf("\t\teof = %llu\n",
 		    (u_longlong_t)hist->sh_eof);
 		(void) printf("\t\trecords_lost = %llu\n",
 		    (u_longlong_t)hist->sh_records_lost);
 	}
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf)
 {
 	if (dump_opt['P'])
 		(void) sprintf(buf, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf);
 }
 
 /*
  * Bar used by dump_histogram(): a suffix of histo_stars is printed for each
  * bucket.  histo_width is the number of '*' characters, i.e. the array size
  * without the terminating NUL.
  */
 const char histo_stars[] = "****************************************";
 const int histo_width = sizeof (histo_stars) - 1;
 
 /*
  * Print the 'size'-entry histogram 'histo', one line per bucket, covering
  * the range from the first to the last non-empty bucket.  Each line shows
  * the bucket index plus 'offset', the bucket count, and a bar of stars.
  * Bars are scaled against the largest count, or against histo_width when
  * the largest count is smaller than that.  Nothing is printed when every
  * bucket is zero.
  */
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int first = size - 1;
 	int last = 0;
 	uint64_t scale = 0;
 	int b;
 
 	for (b = 0; b < size; b++) {
 		uint64_t count = histo[b];
 
 		if (count > scale)
 			scale = count;
 		if (count == 0)
 			continue;
 		if (b > last)
 			last = b;
 		if (b < first)
 			first = b;
 	}
 
 	if (scale < histo_width)
 		scale = histo_width;
 
 	for (b = first; b <= last; b++) {
 		/* Number of leading stars to skip; fuller buckets skip fewer. */
 		uint64_t skip = (scale - histo[b]) * histo_width / scale;
 
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    b + offset, (u_longlong_t)histo[b], &histo_stars[skip]);
 	}
 }
 
 /*
  * Print statistics for the ZAP object 'object'.  Nothing is printed when
  * zap_get_stats() fails.  A ZAP whose pointer table length is zero is
  * reported as a microzap on a single line; otherwise the pointer table
  * fields, summary counters and five histograms are printed.
  */
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	zap_stats_t zs;
 
 	if (zap_get_stats(os, object, &zs) != 0)
 		return;
 
 	if (zs.zs_ptrtbl_len == 0) {
 		ASSERT(zs.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)zs.zs_blocksize,
 		    (u_longlong_t)zs.zs_num_entries);
 		return;
 	}
 
 	/* Heading and histogram pairs, printed in this order. */
 	const struct {
 		const char *hs_title;
 		const uint64_t *hs_buckets;
 	} histos[] = {
 		{ "\t\tLeafs with 2^n pointers:\n",
 		    zs.zs_leafs_with_2n_pointers },
 		{ "\t\tBlocks with n*5 entries:\n",
 		    zs.zs_blocks_with_n5_entries },
 		{ "\t\tBlocks n/10 full:\n",
 		    zs.zs_blocks_n_tenths_full },
 		{ "\t\tEntries with n chunks:\n",
 		    zs.zs_entries_using_n_chunks },
 		{ "\t\tBuckets with n entries:\n",
 		    zs.zs_buckets_with_n_entries },
 	};
 
 	(void) printf("\tFat ZAP stats:\n");
 
 	(void) printf("\t\tPointer table:\n");
 	(void) printf("\t\t\t%llu elements\n",
 	    (u_longlong_t)zs.zs_ptrtbl_len);
 	(void) printf("\t\t\tzt_blk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
 	(void) printf("\t\t\tzt_numblks: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
 	(void) printf("\t\t\tzt_shift: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
 	(void) printf("\t\t\tzt_blks_copied: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
 	(void) printf("\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
 
 	(void) printf("\t\tZAP entries: %llu\n",
 	    (u_longlong_t)zs.zs_num_entries);
 	(void) printf("\t\tLeaf blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_leafs);
 	(void) printf("\t\tTotal blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_blocks);
 	(void) printf("\t\tzap_block_type: 0x%llx\n",
 	    (u_longlong_t)zs.zs_block_type);
 	(void) printf("\t\tzap_magic: 0x%llx\n",
 	    (u_longlong_t)zs.zs_magic);
 	(void) printf("\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)zs.zs_salt);
 
 	for (size_t h = 0; h < sizeof (histos) / sizeof (histos[0]); h++) {
 		(void) printf("%s", histos[h].hs_title);
 		dump_histogram(histos[h].hs_buckets, ZAP_HISTOGRAM_SIZE, 0);
 	}
 }
 
 /*
  * Object viewer that ignores all of its arguments and prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer that prints a fixed "UNKNOWN OBJECT TYPE" line and ignores
  * all of its arguments.
  */
 /*ARGSUSED*/
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	static const char unknown_msg[] = "\tUNKNOWN OBJECT TYPE\n";
 
 	(void) printf("%s", unknown_msg);
 }
 
 /*
  * Object viewer with an empty body: all arguments are ignored and nothing
  * is printed.  Unlike the neighbouring viewers it has external linkage.
  */
 /*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer with an empty body: all arguments are ignored and nothing
  * is printed.
  */
 /*ARGSUSED*/
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer for a generic ZAP object.  Prints the ZAP statistics and
  * then one "name = value" line per entry.  Values made of 1-byte integers
  * are printed as a string; values made of 2-, 4- or 8-byte integers are
  * printed as a space-separated list of numbers.  Entries with no integers
  * print an empty value.
  *
  * The value buffer is allocated zero-filled and one byte larger than the
  * value, so a 1-byte-integer value that lacks a terminating NUL on disk
  * cannot make the "%s" conversion read past the allocation.  The result
  * of zap_lookup() is deliberately ignored: on failure the zero-filled
  * buffer is printed.
  */
 /*ARGSUSED*/
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		size_t propsize;
 		void *prop;
 
 		(void) printf("\t\t%s = ", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 
 		/* One extra zeroed byte guarantees NUL termination. */
 		propsize = attr.za_num_integers * attr.za_integer_length + 1;
 		prop = umem_zalloc(propsize, UMEM_NOFAIL);
 		(void) zap_lookup(os, object, attr.za_name,
 		    attr.za_integer_length, attr.za_num_integers, prop);
 		if (attr.za_integer_length == 1) {
 			(void) printf("%s", (char *)prop);
 		} else {
 			for (uint64_t i = 0; i < attr.za_num_integers; i++) {
 				switch (attr.za_integer_length) {
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
 					break;
 				case 4:
 					(void) printf("%u ",
 					    ((uint32_t *)prop)[i]);
 					break;
 				case 8:
 					/* Signed cast to match %lld. */
 					(void) printf("%lld ",
 					    (longlong_t)((int64_t *)prop)[i]);
 					break;
 				}
 			}
 		}
 		(void) printf("\n");
 		umem_free(prop, propsize);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Object viewer for a bpobj.  'data' points at the bpobj_phys_t and 'size'
  * is the number of valid bytes there.  Fields are only read and printed
  * when 'size' shows that they are present: comp/uncomp need at least
  * BPOBJ_SIZE_V1 bytes, and the subobj fields need the full structure.
  * Prints nothing when 'data' is NULL.
  *
  * When dump_opt['d'] is 5 or more, every block pointer stored in the
  * object is read and printed too; the listing stops at the first
  * dmu_read() error.
  */
 static void
 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	bpobj_phys_t *bpop = data;
 	char bytes[32], comp[32], uncomp[32];
 
 	if (bpop == NULL)
 		return;
 
 	zdb_nicenum(bpop->bpo_bytes, bytes);
 
 	(void) printf("\t\tnum_blkptrs = %llu\n",
 	    (u_longlong_t)bpop->bpo_num_blkptrs);
 	(void) printf("\t\tbytes = %s\n", bytes);
 	if (size >= BPOBJ_SIZE_V1) {
 		/* Only touch these fields once we know they exist. */
 		zdb_nicenum(bpop->bpo_comp, comp);
 		zdb_nicenum(bpop->bpo_uncomp, uncomp);
 		(void) printf("\t\tcomp = %s\n", comp);
 		(void) printf("\t\tuncomp = %s\n", uncomp);
 	}
 	if (size >= sizeof (*bpop)) {
 		(void) printf("\t\tsubobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_subobjs);
 		(void) printf("\t\tnum_subobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_subobjs);
 	}
 
 	if (dump_opt['d'] < 5)
 		return;
 
 	for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
 		char blkbuf[BP_SPRINTF_LEN];
 		blkptr_t bp;
 
 		int err = dmu_read(os, object,
 		    i * sizeof (bp), sizeof (bp), &bp, 0);
 		if (err != 0) {
 			(void) printf("got error %u from dmu_read\n", err);
 			break;
 		}
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
 		(void) printf("\t%s\n", blkbuf);
 	}
 }
 
 /*
  * Object viewer for a bpobj sub-object array.  The first doi_max_offset
  * bytes of the object are read as an array of 64-bit values, and the
  * entries are printed one per line up to and including the last non-zero
  * one.  A dmu_read() failure is reported and nothing else is printed.
  */
 /* ARGSUSED */
 static void
 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dmu_object_info_t doi;
 	uint64_t *subobjs;
 	int64_t last;
 	int err;
 
 	VERIFY0(dmu_object_info(os, object, &doi));
 	subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
 
 	err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
 	if (err == 0) {
 		/* Walk back from the end to the last non-zero entry. */
 		last = (int64_t)(doi.doi_max_offset / 8) - 1;
 		while (last >= 0 && subobjs[last] == 0)
 			last--;
 
 		for (int64_t i = 0; i <= last; i++)
 			(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
 	} else {
 		(void) printf("got error %u from dmu_read\n", err);
 	}
 
 	kmem_free(subobjs, doi.doi_max_offset);
 }
 
 /*
  * Object viewer for a DDT ZAP: prints only the ZAP statistics for the
  * object.  'data' and 'size' are not used.
  */
 /*ARGSUSED*/
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dump_zap_stats(os, object);
 	/* contents are printed elsewhere, properly decoded */
 }
 
 /*
  * Object viewer for the SA attribute registration ZAP.  After the ZAP
  * statistics, each entry is printed as its name, the raw first integer in
  * hex, and that integer decoded with ATTR_LENGTH, ATTR_BSWAP and ATTR_NUM.
  * Entries with no integers print an empty value.
  */
 /*ARGSUSED*/
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = ", entry.za_name);
 		if (entry.za_num_integers != 0) {
 			(void) printf(" %llx : [%d:%d:%d]\n",
 			    (u_longlong_t)entry.za_first_integer,
 			    (int)ATTR_LENGTH(entry.za_first_integer),
 			    (int)ATTR_BSWAP(entry.za_first_integer),
 			    (int)ATTR_NUM(entry.za_first_integer));
 		} else {
 			(void) printf("\n");
 		}
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Object viewer for the SA layout ZAP.  After the ZAP statistics, each
  * entry is printed as "name = [ a  b  ... ]", where the values are the
  * entry's 16-bit integers.  An entry with no integers prints only the
  * opening bracket and a newline.  An integer length other than 2, or a
  * failed zap_lookup(), trips VERIFY.
  */
 /*ARGSUSED*/
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		uint16_t *ids;
 		int n;
 
 		(void) printf("\t\t%s = [", entry.za_name);
 		if (entry.za_num_integers == 0) {
 			(void) printf("\n");
 			zap_cursor_advance(&cursor);
 			continue;
 		}
 
 		VERIFY(entry.za_integer_length == 2);
 		ids = umem_zalloc(entry.za_num_integers *
 		    entry.za_integer_length, UMEM_NOFAIL);
 
 		VERIFY(zap_lookup(os, object, entry.za_name,
 		    entry.za_integer_length,
 		    entry.za_num_integers, ids) == 0);
 
 		for (n = 0; n != entry.za_num_integers; n++)
 			(void) printf(" %d ", (int)ids[n]);
 		(void) printf("]\n");
 
 		umem_free(ids,
 		    entry.za_num_integers * entry.za_integer_length);
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Object viewer for a ZPL directory ZAP.  After the ZAP statistics, each
  * entry is printed as its name, the object number extracted with
  * ZFS_DIRENT_OBJ, and a type name looked up by ZFS_DIRENT_TYPE in a
  * 16-entry table.
  */
 /*ARGSUSED*/
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 	const char *typenames[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = %lld (type: %s)\n",
 		    entry.za_name, ZFS_DIRENT_OBJ(entry.za_first_integer),
 		    typenames[ZFS_DIRENT_TYPE(entry.za_first_integer)]);
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Count the leaf vdevs at or below 'vd' whose DTL space map exists and
  * whose dbuf size equals sizeof (space_map_phys_t).  A leaf contributes
  * 1 or 0; an interior vdev returns the sum over its children.
  */
 int
 get_dtl_refcount(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int total = 0;
 
 		for (int child = 0; child < vd->vdev_children; child++)
 			total += get_dtl_refcount(vd->vdev_child[child]);
 		return (total);
 	}
 
 	space_map_t *dtl_sm = vd->vdev_dtl_sm;
 
 	if (dtl_sm == NULL)
 		return (0);
 	if (dtl_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 		return (0);
 	return (1);
 }
 
 /*
  * Count the metaslabs at or below 'vd' whose space map exists and whose
  * dbuf size equals sizeof (space_map_phys_t).  Metaslabs are only examined
  * on vdevs that are their own top-level vdev and are not being removed;
  * children are always visited recursively.
  */
 int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int total = 0;
 	boolean_t count_here = (vd->vdev_top == vd && !vd->vdev_removing);
 
 	if (count_here) {
 		for (int ms = 0; ms < vd->vdev_ms_count; ms++) {
 			space_map_t *sm = vd->vdev_ms[ms]->ms_sm;
 
 			if (sm == NULL)
 				continue;
 			if (sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 				total++;
 		}
 	}
 
 	for (int child = 0; child < vd->vdev_children; child++)
 		total += get_metaslab_refcount(vd->vdev_child[child]);
 
 	return (total);
 }
 
 /*
  * Compare the refcount recorded for SPA_FEATURE_SPACEMAP_HISTOGRAM with
  * the count obtained by walking the vdev tree (DTL plus metaslab space
  * maps).  Returns 0 when they agree; otherwise prints both values and
  * returns 2.  The expected count stays 0 if feature_get_refcount() does
  * not set it, because its return value is ignored.
  */
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	uint64_t expected = 0;
 	uint64_t actual;
 
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
 	    &expected);
 
 	actual = get_dtl_refcount(spa->spa_root_vdev);
 	actual += get_metaslab_refcount(spa->spa_root_vdev);
 
 	if (expected == actual)
 		return (0);
 
 	(void) printf("space map refcount mismatch: expected %lld != "
 	    "actual %lld\n",
 	    (longlong_t)expected,
 	    (longlong_t)actual);
 	return (2);
 }
 
 /*
  * Print every entry of the space map 'sm', read from its object in 'os'.
  * Debug entries are printed with their action, txg and sync pass; other
  * entries are printed as an alloc ('A') or free ('F') range with start,
  * end and size in hex.  The net allocation implied by the range entries is
  * accumulated and a warning is printed when it differs from
  * space_map_allocated().  Does nothing when 'sm' is NULL.
  */
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	uint64_t alloc, offset, entry;
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 			    "INVALID", "INVALID", "INVALID", "INVALID" };
 
 	if (sm == NULL)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	alloc = 0;
 	for (offset = 0; offset < space_map_length(sm);
 	    offset += sizeof (entry)) {
 		uint8_t mapshift = sm->sm_shift;
 		uint64_t run, start;
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 
 		if (SM_DEBUG_DECODE(entry)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
 			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
 			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
 			continue;
 		}
 
 		/* Decode the range once; both values are in bytes. */
 		run = SM_RUN_DECODE(entry) << mapshift;
 		start = (SM_OFFSET_DECODE(entry) << mapshift) + sm->sm_start;
 
 		(void) printf("\t    [%6llu]    %c  range:"
 		    " %010llx-%010llx  size: %06llx\n",
 		    (u_longlong_t)(offset / sizeof (entry)),
 		    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
 		    (u_longlong_t)start,
 		    (u_longlong_t)(start + run),
 		    (u_longlong_t)run);
 
 		if (SM_TYPE_DECODE(entry) == SM_ALLOC)
 			alloc += run;
 		else
 			alloc -= run;
 	}
 
 	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
 		    "with space map summary (%llu)\n",
 		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
 	}
 }
 
 /*
  * Print a one-line summary for a metaslab (node count of its size tree,
  * largest block size from metaslab_block_maxsize(), and the space in its
  * range tree as a percentage of the metaslab size), followed by the range
  * tree's histogram.
  */
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	range_tree_t *free_tree = msp->ms_tree;
 	avl_tree_t *by_size = &msp->ms_size_tree;
 	int pct_free = range_tree_space(free_tree) * 100 / msp->ms_size;
 	char maxsize_str[32];
 
 	zdb_nicenum(metaslab_block_maxsize(msp), maxsize_str);
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", avl_numnodes(by_size), "maxsize", maxsize_str,
 	    "freepct", pct_free);
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(free_tree->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print one metaslab.  The summary line (id, offset, space map object and
  * free space) is always printed.  Further output depends on the option
  * levels:
  *   -mmm without -L   load the metaslab if needed and print its stats
  *   -mm               print the on-disk histogram when a space map exists
  *                     and the spacemap histogram feature is active
  *   -dddddd or -mmmm  print every space map entry
  */
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	char free_str[32];
 
 	zdb_nicenum(msp->ms_size - space_map_allocated(sm), free_str);
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), free_str);
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
 		metaslab_load_wait(msp);
 		if (!msp->ms_loaded) {
 			VERIFY0(metaslab_load(msp));
 			range_tree_stat_verify(msp->ms_tree);
 		}
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * The space map histogram represents free space in chunks
 		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
 		return;
 
 	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 
 	mutex_enter(&msp->ms_lock);
 	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Print the two-line column header that precedes a vdev's metaslab list:
  * the vdev id and metaslab count with the column titles, then a row of
  * dashes.
  */
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	u_longlong_t vdev_id = (u_longlong_t)vd->vdev_id;
 	u_longlong_t ms_count = (u_longlong_t)vd->vdev_ms_count;
 
 	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
 	    vdev_id,
 	    "metaslabs", ms_count,
 	    "offset", "spacemap", "free");
 	(void) printf("\t%15s   %19s   %15s   %10s\n",
 	    "---------------", "-------------------",
 	    "---------------", "-------------");
 }
 
 /*
  * For every top-level vdev whose metaslab group belongs to the pool's
  * normal class, verify the group histogram, recompute and store the
  * group's fragmentation, and print the vdev id, metaslab count,
  * fragmentation ("-" when ZFS_FRAG_INVALID) and histogram.  Finish with
  * the same figures for the class as a whole.
  */
 static void
 dump_metaslab_groups(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	uint64_t pool_frag;
 
 	metaslab_class_histogram_verify(normal);
 
 	for (int child = 0; child < rvd->vdev_children; child++) {
 		vdev_t *tvd = rvd->vdev_child[child];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (mg->mg_class != normal)
 			continue;
 
 		metaslab_group_histogram_verify(mg);
 		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 		    "fragmentation",
 		    (u_longlong_t)tvd->vdev_id,
 		    (u_longlong_t)tvd->vdev_ms_count);
 		if (mg->mg_fragmentation != ZFS_FRAG_INVALID) {
 			(void) printf("%3llu%%\n",
 			    (u_longlong_t)mg->mg_fragmentation);
 		} else {
 			(void) printf("%3s\n", "-");
 		}
 		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 	}
 
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 	pool_frag = metaslab_class_fragmentation(normal);
 	if (pool_frag != ZFS_FRAG_INVALID)
 		(void) printf("\t%3llu%%\n", (u_longlong_t)pool_frag);
 	else
 		(void) printf("\t%3s\n", "-");
 	dump_histogram(normal->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print the pool's metaslabs.  Without -d and with at least one entry in
  * zopt_object[], entry 0 selects a single top-level vdev (an out-of-range
  * id is fatal).  Any further entries select individual metaslabs of that
  * vdev; out-of-range metaslab numbers are reported on stderr and skipped.
  * Otherwise every metaslab of every top-level vdev is printed.
  */
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	uint64_t first = 0;
 	uint64_t limit = rvd->vdev_children;
 	uint64_t m;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_objects > 0) {
 		first = zopt_object[0];
 
 		if (first >= limit)
 			(void) fatal("bad vdev id: %llu", (u_longlong_t)first);
 
 		if (zopt_objects > 1) {
 			vd = rvd->vdev_child[first];
 			print_vdev_metaslab_header(vd);
 
 			for (m = 1; m < zopt_objects; m++) {
 				uint64_t msid = zopt_object[m];
 
 				if (msid >= vd->vdev_ms_count) {
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)msid);
 					continue;
 				}
 				dump_metaslab(vd->vdev_ms[msid]);
 			}
 			(void) printf("\n");
 			return;
 		}
 
 		/* Only the selected vdev is walked below. */
 		limit = first + 1;
 	}
 
 	for (uint64_t c = first; c < limit; c++) {
 		vd = rvd->vdev_child[c];
 		print_vdev_metaslab_header(vd);
 
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print one line for each physical variant of the dedup entry 'dde' whose
  * birth txg is non-zero: the walk index, the reference count, the variant
  * name and the block pointer reconstructed with ddt_bp_create().
  */
 static void
 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
 {
 	const ddt_key_t *key = &dde->dde_key;
 	char *types[4] = { "ditto", "single", "double", "triple" };
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++) {
 		const ddt_phys_t *phys = &dde->dde_phys[p];
 
 		if (phys->ddp_phys_birth != 0) {
 			ddt_bp_create(ddt->ddt_checksum, key, phys, &blk);
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 			(void) printf("index %llx refcnt %llu %s %s\n",
 			    (u_longlong_t)index,
 			    (u_longlong_t)phys->ddp_refcnt,
 			    types[p], blkbuf);
 		}
 	}
 }
 
 /*
  * Print the dedup, compression and copies ratios derived from 'dds',
  * together with the combined figure dedup * compress / copies.  Prints
  * nothing when dds_blocks is zero.
  */
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	double ref_lsize, ref_psize, ref_dsize, dsize;
 	double dedup, compress, copies;
 
 	if (dds->dds_blocks == 0)
 		return;
 
 	ref_lsize = (double)dds->dds_ref_lsize;
 	ref_psize = (double)dds->dds_ref_psize;
 	ref_dsize = (double)dds->dds_ref_dsize;
 	dsize = (double)dds->dds_dsize;
 
 	dedup = ref_dsize / dsize;
 	compress = ref_lsize / ref_psize;
 	copies = ref_dsize / ref_psize;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 /*
  * Print one dedup table object, selected by (type, class).  Returns
  * silently when the object does not exist or holds no entries.  Otherwise
  * a summary line with the entry count and the average on-disk and in-core
  * size per entry is printed.  Further output depends on the -D level:
  *   >= 3  the table's histogram
  *   >= 4  every entry, except for the DDT_CLASS_UNIQUE table
  *   >= 5  every entry of the DDT_CLASS_UNIQUE table as well
  */
 static void
 dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 {
 	char name[DDT_NAMELEN];
 	ddt_entry_t dde;
 	uint64_t cursor = 0;
 	dmu_object_info_t doi;
 	uint64_t nentries, disk_bytes, core_bytes;
 	int err;
 
 	err = ddt_object_info(ddt, type, class, &doi);
 
 	if (err == ENOENT)
 		return;
 	ASSERT(err == 0);
 
 	err = ddt_object_count(ddt, type, class, &nentries);
 	ASSERT(err == 0);
 	if (nentries == 0)
 		return;
 
 	disk_bytes = doi.doi_physical_blocks_512 << 9;
 	core_bytes = doi.doi_fill_count * doi.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, name);
 
 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
 	    name,
 	    (u_longlong_t)nentries,
 	    (u_longlong_t)(disk_bytes / nentries),
 	    (u_longlong_t)(core_bytes / nentries));
 
 	if (dump_opt['D'] < 3)
 		return;
 
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	if (dump_opt['D'] < 4 ||
 	    (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE))
 		return;
 
 	(void) printf("%s contents:\n\n", name);
 
 	for (;;) {
 		err = ddt_object_walk(ddt, type, class, &cursor, &dde);
 		if (err != 0)
 			break;
 		dump_dde(ddt, &dde, cursor);
 	}
 
 	ASSERT(err == ENOENT);
 
 	(void) printf("\n");
 }
 
 /*
  * Dump every dedup table of the pool (each checksum, type and class), then
  * the pool-wide totals.  When the totals show no blocks, "All DDTs are
  * empty" is printed and the function returns.  With -DD or higher the
  * aggregated histogram is printed before the dedup ratio line.
  */
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_histogram_t ddh_total = { 0 };
 	ddt_stat_t dds_total = { 0 };
 
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		ddt_t *ddt = spa->spa_ddt[cksum];
 
 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 			enum ddt_class class;
 
 			for (class = 0; class < DDT_CLASSES; class++)
 				dump_ddt(ddt, type, class);
 		}
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
 	if (dds_total.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &ddh_total);
 		zpool_dump_ddt(&dds_total, &ddh_total);
 	}
 
 	dump_dedup_ratio(&dds_total);
 }
 
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	char *prefix = arg;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    prefix,
 	    (u_longlong_t)start,
 	    (u_longlong_t)(start + size),
 	    (u_longlong_t)(size));
 }
 
 /*
  * Print the dirty time logs of 'vd' and, recursively, of its children,
  * indenting each level by four more columns.  For each vdev a heading
  * line shows its path (or its type name, or the pool name for a vdev with
  * no parent) and whether vdev_dtl_required() reports it as required.
  * Every non-empty DTL range tree is then printed segment by segment.  With
  * -dddddd the DTL space map of a childless vdev is dumped as well.
  */
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t required;
 	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
 	char prefix[256];
 	const char *label;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	if (vd->vdev_path)
 		label = vd->vdev_path;
 	else if (vd->vdev_parent)
 		label = vd->vdev_ops->vdev_op_type;
 	else
 		label = spa_name(spa);
 
 	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    label,
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int type = 0; type < DTL_TYPES; type++) {
 		range_tree_t *rt = vd->vdev_dtl[type];
 
 		if (range_tree_space(rt) == 0)
 			continue;
 
 		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 		    indent + 2, "", name[type]);
 
 		mutex_enter(rt->rt_lock);
 		range_tree_walk(rt, dump_dtl_seg, prefix);
 		mutex_exit(rt->rt_lock);
 
 		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
 			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
 	}
 
 	for (int child = 0; child < vd->vdev_children; child++)
 		dump_dtl(vd->vdev_child[child], indent + 4);
 }
 
 /* from spa_history.c: spa_history_create_obj() */
 #define	HIS_BUF_LEN_DEF	(128 << 10)
 #define	HIS_BUF_LEN_MAX	(1 << 30)
 
 /*
  * Read the pool history and print it.
  *
  * The history is fetched with spa_history_get() into a buffer that starts
  * at HIS_BUF_LEN_DEF bytes and is doubled whenever a single record does
  * not fit, up to (but not including) HIS_BUF_LEN_MAX.  Each chunk is
  * unpacked into the 'events' array with zpool_history_unpack().
  *
  * On an allocation failure or a spa_history_get() error a message is
  * written to stderr and the function returns without printing any
  * history; the read buffer is released on every such path.
  *
  * Each event with a timestamp is printed as "<time> <command>".  Internal
  * events are rendered as "[internal <name> txg:<txg>] <string>".  With
  * -hh every event's nvlist is dumped too, preceded by "unrecognized
  * record:" when the event could not be printed.
  */
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf = NULL;
 	uint64_t bufsize = HIS_BUF_LEN_DEF;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	time_t tsec;
 	struct tm t;
 	char tbuf[30];
 	char internalstr[MAXPATHLEN];
 
 	if ((buf = malloc(bufsize)) == NULL) {
 		(void) fprintf(stderr, "Unable to read history: "
 		    "out of memory\n");
 		/* Without a buffer there is nothing that can be read. */
 		return;
 	}
 	do {
 		len = bufsize;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			free(buf);
 			return;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 		off -= resid;
 
 		/*
 		 * If the history block is too big, double the buffer
 		 * size and try again.
 		 */
 		if (resid == len) {
 			free(buf);
 			buf = NULL;
 
 			bufsize <<= 1;
 			if ((bufsize >= HIS_BUF_LEN_MAX) ||
 			    ((buf = malloc(bufsize)) == NULL)) {
 				(void) fprintf(stderr, "Unable to read history: "
 				    "out of memory\n");
 				return;
 			}
 		}
 	} while (len != 0);
 	free(buf);
 
 	(void) printf("\nHistory:\n");
 	for (uint_t i = 0; i < num; i++) {
 		uint64_t time, txg, ievent;
 		char *cmd, *intstr;
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
 		    &time) != 0)
 			goto next;
 		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
 		    &cmd) != 0) {
 			if (nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
 				goto next;
 			verify(nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG, &txg) == 0);
 			verify(nvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR, &intstr) == 0);
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			/* The cast makes the argument match %lld. */
 			(void) snprintf(internalstr,
 			    sizeof (internalstr),
 			    "[internal %s txg:%lld] %s",
 			    zfs_history_event_names[ievent], (longlong_t)txg,
 			    intstr);
 			cmd = internalstr;
 		}
 		tsec = time;
 		(void) localtime_r(&tsec, &t);
 		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		(void) printf("%s %s\n", tbuf, cmd);
 		printed = B_TRUE;
 
 next:
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 }
 
 /*
  * Object viewer registered for the "DMU dnode" slot of object_viewer[].
  * It has an empty body: nothing is printed for this object type here.
  */
 /*ARGSUSED*/
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Convert the block id in bookmark 'zb' into an offset.
  *
  * With a dnode, the block id is scaled up by the number of block
  * pointers per indirect block once for every level, then multiplied by
  * the data block size (dn_datablkszsec sectors of 1 << SPA_MINBLOCKSHIFT
  * bytes).  Without a dnode the bookmark must have a negative level; the
  * result is the raw block id for object 0, otherwise the block id times
  * the logical size of 'bp'.
  */
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	uint64_t scaled;
 
 	if (dnp != NULL) {
 		ASSERT(zb->zb_level >= 0);
 
 		scaled = zb->zb_blkid <<
 		    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
 		return ((scaled * dnp->dn_datablkszsec) << SPA_MINBLOCKSHIFT);
 	}
 
 	ASSERT(zb->zb_level < 0);
 	if (zb->zb_object != 0)
 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
 	return (zb->zb_blkid);
 }
 
 /*
  * Format a condensed one-line description of block pointer 'bp' into
  * 'blkbuf', never writing more than 'buflen' bytes.
  *
  * With dump_opt['b'] >= 6 the full snprintf_blkptr() form is used
  * instead.  Embedded block pointers print their embedded type, sizes and
  * birth txg.  Otherwise the output is a list of "vdev:offset:asize"
  * triples (only the first DVA unless dump_opt['d'] > 5) followed by the
  * sizes and birth txgs.
  */
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Bounded like every other write into blkbuf. */
 		(void) snprintf(blkbuf, buflen,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
 		    (u_longlong_t)bp->blk_birth);
 		return;
 	}
 
 	/* Each piece below is appended at the current end of the string. */
 	blkbuf[0] = '\0';
 	for (int i = 0; i < ndvas; i++)
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)bp->blk_birth);
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)bp->blk_birth,
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
 	}
 }
 
 /*
  * Print one line for block pointer 'bp': its offset, an "L<level>"
  * marker placed in the column that corresponds to the block's level,
  * and the compact block pointer description.
  */
 static void
 print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char bpstr[BP_SPRINTF_LEN];
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
 	/* One column per level, from the top level down to -1. */
 	for (int lvl = dnp->dn_nlevels - 1; lvl >= -1; lvl--) {
 		if (lvl != zb->zb_level) {
 			(void) printf(" ");
 			continue;
 		}
 		(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 	}
 
 	snprintf_blkptr_compact(bpstr, sizeof (bpstr), bp);
 	(void) printf("%s\n", bpstr);
 }
 
 /*
  * Print block pointer 'bp' and, when it is a non-hole indirect block,
  * read it through the ARC and recurse into every child block pointer.
  * Block pointers with a zero birth txg are skipped entirely.
  *
  * Returns 0 on success, or the first error from arc_read() or from a
  * recursive visit.
  */
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf;
 	blkptr_t *child;
 	uint64_t fill_sum = 0;
 	int nptrs;
 	int rc;
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	print_indirect(bp, zb, dnp);
 
 	/* Level-0 blocks and holes have no children to descend into. */
 	if (BP_GET_LEVEL(bp) == 0 || BP_IS_HOLE(bp))
 		return (0);
 
 	nptrs = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 	rc = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 	if (rc != 0)
 		return (rc);
 	ASSERT(abuf->b_data);
 
 	/* recursively visit blocks below this */
 	child = abuf->b_data;
 	for (int i = 0; i < nptrs; i++, child++) {
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 		    zb->zb_level - 1,
 		    zb->zb_blkid * nptrs + i);
 		rc = visit_indirect(spa, dnp, child, &czb);
 		if (rc != 0)
 			break;
 		fill_sum += BP_GET_FILL(child);
 	}
 	/* The children's fill counts must add up to the parent's. */
 	if (rc == 0)
 		ASSERT3U(fill_sum, ==, BP_GET_FILL(bp));
 	(void) arc_buf_remove_ref(abuf, &abuf);
 
 	return (rc);
 }
 
 /*
  * Print the tree of block pointers of dnode 'dn', starting from each of
  * the dn_nblkptr block pointers stored in the dnode itself.  Errors
  * returned by visit_indirect() are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *phys = dn->dn_phys;
 	zbookmark_phys_t top;
 
 	(void) printf("Indirect blocks:\n");
 
 	/* The dnode's own block pointers sit at the top level. */
 	SET_BOOKMARK(&top, dmu_objset_id(dn->dn_objset),
 	    dn->dn_object, phys->dn_nlevels - 1, 0);
 	for (int slot = 0; slot < phys->dn_nblkptr; slot++) {
 		top.zb_blkid = slot;
 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), phys,
 		    &phys->dn_blkptr[slot], &top);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Object viewer for a DSL directory: print the fields of the
  * dsl_dir_phys_t passed in 'data'.  Does nothing when 'data' is NULL.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dir_phys_t *ddp = data;
 	char numstr[32];
 	time_t created;
 
 	if (ddp == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 	/* ctime() supplies the trailing newline for this line. */
 	created = ddp->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 
 	(void) printf("\t\thead_dataset_obj = %llu\n",
 	    (u_longlong_t)ddp->dd_head_dataset_obj);
 	(void) printf("\t\tparent_dir_obj = %llu\n",
 	    (u_longlong_t)ddp->dd_parent_obj);
 	(void) printf("\t\torigin_obj = %llu\n",
 	    (u_longlong_t)ddp->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)ddp->dd_child_dir_zapobj);
 
 	zdb_nicenum(ddp->dd_used_bytes, numstr);
 	(void) printf("\t\tused_bytes = %s\n", numstr);
 	zdb_nicenum(ddp->dd_compressed_bytes, numstr);
 	(void) printf("\t\tcompressed_bytes = %s\n", numstr);
 	zdb_nicenum(ddp->dd_uncompressed_bytes, numstr);
 	(void) printf("\t\tuncompressed_bytes = %s\n", numstr);
 	zdb_nicenum(ddp->dd_quota, numstr);
 	(void) printf("\t\tquota = %s\n", numstr);
 	zdb_nicenum(ddp->dd_reserved, numstr);
 	(void) printf("\t\treserved = %s\n", numstr);
 
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)ddp->dd_props_zapobj);
 	(void) printf("\t\tdeleg_zapobj = %llu\n",
 	    (u_longlong_t)ddp->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ddp->dd_flags);
 
 /* Print one dd_used_breakdown[] slot, labelled with its DD_USED_ suffix. */
 #define	SHOW_USED(which) \
 	zdb_nicenum(ddp->dd_used_breakdown[DD_USED_ ## which], numstr); \
 	(void) printf("\t\tused_breakdown[" #which "] = %s\n", numstr)
 	SHOW_USED(HEAD);
 	SHOW_USED(SNAP);
 	SHOW_USED(CHILD);
 	SHOW_USED(CHILD_RSRV);
 	SHOW_USED(REFRSRV);
 #undef SHOW_USED
 }
 
 /*
  * Object viewer for a DSL dataset: print the fields of the
  * dsl_dataset_phys_t passed in 'data'.  Does nothing when 'data' is NULL.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dataset_phys_t *dsp = data;
 	char numstr[32];
 	char bpstr[BP_SPRINTF_LEN];
 	time_t created;
 
 	if (dsp == NULL)
 		return;
 
 	ASSERT(size == sizeof (*dsp));
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_prev_snap_obj);
 	(void) printf("\t\tprev_snap_txg = %llu\n",
 	    (u_longlong_t)dsp->ds_prev_snap_txg);
 	(void) printf("\t\tnext_snap_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_next_snap_obj);
 	(void) printf("\t\tsnapnames_zapobj = %llu\n",
 	    (u_longlong_t)dsp->ds_snapnames_zapobj);
 	(void) printf("\t\tnum_children = %llu\n",
 	    (u_longlong_t)dsp->ds_num_children);
 	(void) printf("\t\tuserrefs_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_userrefs_obj);
 
 	/* ctime() supplies the trailing newline for this line. */
 	created = dsp->ds_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 
 	(void) printf("\t\tcreation_txg = %llu\n",
 	    (u_longlong_t)dsp->ds_creation_txg);
 	(void) printf("\t\tdeadlist_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_deadlist_obj);
 
 	/* The size fields share one scratch buffer, formatted on demand. */
 	zdb_nicenum(dsp->ds_referenced_bytes, numstr);
 	(void) printf("\t\tused_bytes = %s\n", numstr);
 	zdb_nicenum(dsp->ds_compressed_bytes, numstr);
 	(void) printf("\t\tcompressed_bytes = %s\n", numstr);
 	zdb_nicenum(dsp->ds_uncompressed_bytes, numstr);
 	(void) printf("\t\tuncompressed_bytes = %s\n", numstr);
 	zdb_nicenum(dsp->ds_unique_bytes, numstr);
 	(void) printf("\t\tunique = %s\n", numstr);
 
 	(void) printf("\t\tfsid_guid = %llu\n",
 	    (u_longlong_t)dsp->ds_fsid_guid);
 	(void) printf("\t\tguid = %llu\n",
 	    (u_longlong_t)dsp->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)dsp->ds_flags);
 	(void) printf("\t\tnext_clones_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_next_clones_obj);
 	(void) printf("\t\tprops_obj = %llu\n",
 	    (u_longlong_t)dsp->ds_props_obj);
 
 	snprintf_blkptr(bpstr, sizeof (bpstr), &dsp->ds_bp);
 	(void) printf("\t\tbp = %s\n", bpstr);
 }
 
 /*
  * bptree_iterate() callback: print the full description of 'bp' on its
  * own line, skipping block pointers whose birth txg is zero.  Always
  * returns 0.
  */
 /* ARGSUSED */
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char bpstr[BP_SPRINTF_LEN];
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	snprintf_blkptr(bpstr, sizeof (bpstr), bp);
 	(void) printf("\t%s\n", bpstr);
 	return (0);
 }
 
 /*
  * Print a summary of the bptree object 'obj' (dataset count and byte
  * total) under the label 'name'.  Nothing is printed unless
  * dump_opt['d'] >= 3; the individual block pointers are listed only
  * when dump_opt['d'] >= 5.
  */
 static void
 dump_bptree(objset_t *os, uint64_t obj, char *name)
 {
 	int detail = dump_opt['d'];
 	char sizestr[32];
 	bptree_phys_t *phys;
 	dmu_buf_t *bonus;
 
 	if (detail < 3)
 		return;
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
 	phys = bonus->db_data;
 	zdb_nicenum(phys->bt_bytes, sizestr);
 	(void) printf("\n    %s: %llu datasets, %s\n",
 	    name, (unsigned long long)(phys->bt_end - phys->bt_begin),
 	    sizestr);
 	dmu_buf_rele(bonus, FTAG);
 
 	if (detail >= 5) {
 		(void) printf("\n");
 
 		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb,
 		    NULL, NULL);
 	}
 }
 
 /*
  * bpobj iteration callback: print the compact description of 'bp' on
  * its own line.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char line[BP_SPRINTF_LEN];
 
 	ASSERT(bp->blk_birth != 0);
 
 	snprintf_blkptr_compact(line, sizeof (line), bp);
 	(void) printf("\t%s\n", line);
 
 	return (0);
 }
 
 /*
  * Print a summary line for block pointer object 'bpo' under the label
  * 'name', recursing into its sub-objects when it has any.  'indent' is
  * the nesting depth and widens the label column.  Nothing is printed
  * unless dump_opt['d'] >= 3; the block pointers themselves are listed
  * only for the outermost object (indent == 0) when dump_opt['d'] >= 5.
  */
 static void
 dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
 {
 	bpobj_phys_t *phys;
 	char total[32];
 	char csize[32];
 	char usize[32];
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	phys = bpo->bpo_phys;
 	zdb_nicenum(phys->bpo_bytes, total);
 
 	if (!bpo->bpo_havesubobj || phys->bpo_subobjs == 0) {
 		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)phys->bpo_num_blkptrs,
 		    total);
 	} else {
 		zdb_nicenum(phys->bpo_comp, csize);
 		zdb_nicenum(phys->bpo_uncomp, usize);
 		(void) printf("    %*s: object %llu, %llu local blkptrs, "
 		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)phys->bpo_num_blkptrs,
 		    (u_longlong_t)phys->bpo_num_subobjs,
 		    (u_longlong_t)phys->bpo_subobjs,
 		    total, csize, usize);
 
 		/* The sub-object ids are stored as an array of uint64_t. */
 		for (uint64_t n = 0; n < bpo->bpo_phys->bpo_num_subobjs; n++) {
 			uint64_t subid;
 			bpobj_t sub;
 			int err;
 
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    n * sizeof (subid), sizeof (subid), &subid, 0));
 			err = bpobj_open(&sub, bpo->bpo_os, subid);
 			if (err != 0) {
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    err, (u_longlong_t)subid);
 				continue;
 			}
 			dump_full_bpobj(&sub, "subobj", indent + 1);
 			bpobj_close(&sub);
 		}
 	}
 
 	if (dump_opt['d'] >= 5 && indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print a summary of deadlist 'dl'.  Old-format deadlists are handed to
  * dump_full_bpobj().  Otherwise the used/compressed/uncompressed totals
  * are printed (dump_opt['d'] >= 3), followed by one entry per
  * "mintxg -> bpobj" mapping (dump_opt['d'] >= 4); with dump_opt['d'] >= 5
  * each entry's bpobj is dumped in full.
  */
 static void
 dump_deadlist(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
 	uint64_t unused;
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes);
 	zdb_nicenum(dl->dl_phys->dl_comp, comp);
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
 	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
 	    bytes, comp, uncomp);
 
 	if (dump_opt['d'] < 4)
 		return;
 
 	(void) printf("\n");
 
 	/* force the tree to be loaded */
 	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
 
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		/* %llu takes an unsigned argument, hence u_longlong_t. */
 		if (dump_opt['d'] >= 5) {
 			char buf[128];
 			(void) snprintf(buf, sizeof (buf), "mintxg %llu -> "
 			    "obj %llu", (u_longlong_t)dle->dle_mintxg,
 			    (u_longlong_t)dle->dle_bpobj.bpo_object);
 			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
 		} else {
 			(void) printf("mintxg %llu -> obj %llu\n",
 			    (u_longlong_t)dle->dle_mintxg,
 			    (u_longlong_t)dle->dle_bpobj.bpo_object);
 		}
 	}
 }
 
 /*
  * Per-dataset cached state used while dumping znodes.
  *
  * idx_tree and domain_tree are filled by zfs_fuid_table_load() in
  * dump_uidgid() and released by fuid_table_destroy(); fuid_table_loaded
  * records whether they currently hold a loaded table.
  *
  * sa_loaded records whether dump_znode() has already run sa_setup() for
  * the current objset; sa_attr_table is the attribute table that call
  * fills in.  dump_one_dir() resets both flags after each dataset.
  */
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
 static boolean_t sa_loaded;
 sa_attr_type_t *sa_attr_table;
 
 /*
  * Release the FUID index/domain trees if they are currently loaded and
  * mark them as unloaded.  Safe to call when nothing has been loaded.
  */
 static void
 fuid_table_destroy(void)
 {
 	if (fuid_table_loaded) {
 		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 		fuid_table_loaded = B_FALSE;
 	}
 }
 
 /*
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
  * For CIFS files with FUID the fuid is printed in hex followed by
  * the domain-rid string.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	char *domain;
 
 	/* A zero FUID index marks a plain POSIX id. */
 	if (FUID_INDEX(id) == 0) {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 		return;
 	}
 
 	domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 	    (u_longlong_t)id, domain, (int)FUID_RID(id));
 }
 
 /*
  * Print the owner and group of a file.  If either id carries a FUID
  * index and the FUID domain table has not been loaded yet, it is loaded
  * first from the object named by ZFS_FUID_TABLES in the master node.
  */
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uidx = FUID_INDEX(uid);
 	uint32_t gidx = FUID_INDEX(gid);
 	boolean_t has_fuid = (uidx != 0 || gidx != 0);
 
 	/* Load domain table, if not already loaded */
 	if (has_fuid && !fuid_table_loaded) {
 		uint64_t table_obj;
 
 		/* first find the fuid object.  It lives in the master node */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &table_obj) == 0);
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, table_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 /*
  * Object viewer for znodes / SA objects: look up the file's system
  * attributes and print them.
  *
  * On first use for an objset (sa_loaded is false) the SA attribute table
  * is set up via sa_setup().  With dump_opt['d'] < 3 only the path is
  * printed; otherwise path, owner, timestamps, generation, mode, size,
  * parent, link count and flags are printed, plus xattr and rdev when
  * those attributes exist.  Failures to set up SA, get a handle or look
  * up the attributes make the function return early.
  */
 /*ARGSUSED*/
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_handle_t *hdl;
 	uint64_t xattr, rdev, gen;
 	uint64_t uid, gid, mode, fsize, parent, links;
 	uint64_t pflags;
 	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
 	time_t z_crtime, z_atime, z_mtime, z_ctime;
 	/* Sized for exactly the twelve attributes added below. */
 	sa_bulk_attr_t bulk[12];
 	int idx = 0;
 	int error;
 
 	if (!sa_loaded) {
 		uint64_t sa_attrs = 0;
 		uint64_t version;
 
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version) == 0);
 		/* Older ZPL versions keep sa_attrs at 0 for sa_setup(). */
 		if (version >= ZPL_VERSION_SA) {
 			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs) == 0);
 		}
 		if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
 		    ZPL_END, &sa_attr_table)) != 0) {
 			(void) printf("sa_setup failed errno %d, can't "
 			    "display znode contents\n", error);
 			return;
 		}
 		sa_loaded = B_TRUE;
 	}
 
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	/* Queue every attribute for one bulk lookup; idx counts entries. */
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
 	    NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	if (sa_bulk_lookup(hdl, bulk, idx)) {
 		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
 	/* Fall back to a placeholder name when the path can't be resolved. */
 	error = zfs_obj_to_path(os, object, path, sizeof (path));
 	if (error != 0) {
 		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
 		    (u_longlong_t)object);
 	}
 	if (dump_opt['d'] < 3) {
 		(void) printf("\t%s\n", path);
 		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
 	/* Only element [0] of each two-word timestamp is printed. */
 	z_crtime = (time_t)crtm[0];
 	z_atime = (time_t)acctm[0];
 	z_mtime = (time_t)modtm[0];
 	z_ctime = (time_t)chgtm[0];
 
 	(void) printf("\tpath	%s\n", path);
 	dump_uidgid(os, uid, gid);
 	(void) printf("\tatime	%s", ctime(&z_atime));
 	(void) printf("\tmtime	%s", ctime(&z_mtime));
 	(void) printf("\tctime	%s", ctime(&z_ctime));
 	(void) printf("\tcrtime	%s", ctime(&z_crtime));
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 	/* xattr and rdev are optional; print them only when present. */
 	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 	sa_handle_destroy(hdl);
 }
 
 /*
  * Object viewer registered for the ACL slots of object_viewer[].
  * It has an empty body: nothing is printed for these objects here.
  */
 /*ARGSUSED*/
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer registered for the "DMU objset" slot of object_viewer[].
  * It has an empty body: nothing is printed for this object type here.
  */
 /*ARGSUSED*/
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Dispatch table of per-type dump routines.  dump_object() indexes it
  * with ZDB_OT_TYPE() of an object's bonus type and data type.  The table
  * has DMU_OT_NUMTYPES + 1 slots; each entry's comment names the object
  * type it serves, and the final slot is the handler for unknown types.
  */
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bpobj			*/
 	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group used		*/
 	dump_zap,		/* ZFS user/group quota		*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_bpobj_subobjs,	/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 /*
  * Print a one-line summary of object 'object' in objset 'os' and, as
  * 'verbosity' grows, progressively more detail:
  *   > 3   a second line describing the bonus buffer, if there is one
  *   >= 4  dnode flags, maxblkid, and the type-specific viewers for the
  *         bonus buffer and the object data
  *   >= 5  the indirect block tree and the list of allocated segments
  *
  * '*print_header' controls whether the column header is printed first;
  * it is cleared after printing and set again whenever verbose output
  * has been emitted, so that the next object gets a fresh header.
  * Object 0 denotes the meta dnode and has no bonus buffer to hold.
  */
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	size_t auxlen;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
 	char bonus_size[32];
 	char aux[50];
 	int error;
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
 		    "%full", "type");
 		*print_header = 0;
 	}
 
 	if (object == 0) {
 		dn = DMU_META_DNODE(os);
 	} else {
 		error = dmu_bonus_hold(os, object, FTAG, &db);
 		if (error)
 			fatal("dmu_bonus_hold(%llu) failed, errno %u",
 			    (u_longlong_t)object, error);
 		bonus = db->db_data;
 		bsize = db->db_size;
 		dn = DB_DNODE((dmu_buf_impl_t *)db);
 	}
 	dmu_object_info_from_dnode(dn, &doi);
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk);
 	zdb_nicenum(doi.doi_data_block_size, dblk);
 	zdb_nicenum(doi.doi_max_offset, lsize);
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
 	zdb_nicenum(doi.doi_bonus_size, bonus_size);
 	/* For the meta dnode the fill count is scaled per dnode block. */
 	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
 	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
 	    doi.doi_max_offset);
 
 	aux[0] = '\0';
 
 	/*
 	 * Each append is limited to the space that is still free in aux,
 	 * not to the size of the whole array.
 	 */
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		auxlen = strlen(aux);
 		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		auxlen = strlen(aux);
 		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", bonus_size, "bonus",
 		    ZDB_OT_NAME(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (longlong_t)dn->dn_phys->dn_maxblkid);
 
 		/* Bonus viewer gets the bonus data; data viewer gets none. */
 		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
 		    bonus, bsize);
 		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
 		*print_header = 1;
 	}
 
 	if (verbosity >= 5)
 		dump_indirect(dn);
 
 	if (verbosity >= 5) {
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		for (;;) {
 			char segsize[32];
 			/* Find the next data, then the hole that ends it. */
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize);
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Printable names for objset types; dump_dir() indexes this array with
  * dds_type after checking that it is below DMU_OST_NUMTYPES.
  */
 static char *objset_types[DMU_OST_NUMTYPES] = {
 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
 /*
  * Print a summary line for objset 'os' and then, depending on
  * dump_opt['d'] and the objects requested on the command line, its
  * intent log, deadlist and objects.
  *
  * When specific objects were requested (zopt_objects != 0) only those
  * are dumped.  Otherwise every object is walked with dmu_object_next();
  * any terminating error other than ESRCH aborts the program.
  */
 static void
 dump_dir(objset_t *os)
 {
 	dmu_objset_stats_t stats;
 	uint64_t obj, nobjs = 0;
 	uint64_t refdbytes, usedobjs, ignored;
 	char sizestr[32];
 	char bpstr[BP_SPRINTF_LEN + 20];
 	char dsname[MAXNAMELEN];
 	char *ostype = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	int print_header = 1;
 	int error;
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &stats);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	if (stats.dds_type < DMU_OST_NUMTYPES)
 		ostype = objset_types[stats.dds_type];
 
 	if (stats.dds_type != DMU_OST_META) {
 		dmu_objset_space(os, &refdbytes, &ignored, &usedobjs,
 		    &ignored);
 	} else {
 		/* The meta objset takes its numbers from the MOS itself. */
 		stats.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
 		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
 		    dd_used_bytes;
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, sizestr);
 
 	/* The root block pointer is appended only at high verbosity. */
 	bpstr[0] = '\0';
 	if (verbosity >= 4) {
 		(void) snprintf(bpstr, sizeof (bpstr), ", rootbp ");
 		(void) snprintf_blkptr(bpstr + strlen(bpstr),
 		    sizeof (bpstr) - strlen(bpstr), os->os_rootbp);
 	}
 
 	dmu_objset_name(os, dsname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s\n",
 	    dsname, ostype, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)stats.dds_creation_txg,
 	    sizestr, (u_longlong_t)usedobjs, bpstr);
 
 	if (zopt_objects != 0) {
 		for (int n = 0; n < zopt_objects; n++) {
 			dump_object(os, zopt_object[n], verbosity,
 			    &print_header);
 		}
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
 
 	if (verbosity < 2 || BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	/* Object 0 and the accounting objects are not counted below. */
 	dump_object(os, 0, verbosity, &print_header);
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
 	}
 
 	obj = 0;
 	for (;;) {
 		error = dmu_object_next(os, &obj, B_FALSE, 0);
 		if (error != 0)
 			break;
 		dump_object(os, obj, verbosity, &print_header);
 		nobjs++;
 	}
 
 	ASSERT3U(nobjs, ==, usedobjs);
 
 	(void) printf("\n");
 
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 }
 
 /*
  * Print the fields of uberblock 'ub', preceded by 'header' and followed
  * by 'footer' (either may be NULL, meaning nothing is printed).  The
  * root block pointer is included when dump_opt['u'] >= 3.
  *
  * 'header' and 'footer' are emitted verbatim through "%s" rather than
  * being used as format strings, so a '%' in them is printed as is.
  */
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	/*
 	 * NOTE(review): the label says UTC but the time is rendered with
 	 * localtime(); confirm which one is intended.
 	 */
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
 	if (dump_opt['u'] >= 3) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf("%s", footer ? footer : "");
 }
 
 /*
  * Print the pool configuration stored in the MOS.  The bonus buffer of
  * spa_config_object holds the packed nvlist's size as a uint64_t; that
  * size is handed to dump_packed_nvlist(), which prints the nvlist.  A
  * failure to hold the bonus buffer is reported on stderr.
  */
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	/*
 	 * Kept as a full uint64_t: it is loaded from a uint64_t and its
 	 * address is passed on as the viewer's data, so it must not be
 	 * narrower than 64 bits on platforms with a 32-bit size_t.
 	 */
 	uint64_t nvsize = 0;
 	int error = 0;
 
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 
 	if (error == 0) {
 		nvsize = *(uint64_t *)db->db_data;
 		dmu_buf_rele(db, FTAG);
 
 		(void) printf("\nMOS Configuration:\n");
 		dump_packed_nvlist(spa->spa_meta_objset,
 		    spa->spa_config_object, (void *)&nvsize, 1);
 	} else {
 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d\n",
 		    (u_longlong_t)spa->spa_config_object, error);
 	}
 }
 
 static void
 dump_cachefile(const char *cachefile)
 {
 	int fd;
 	struct stat64 statbuf;
 	char *buf;
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		exit(1);
 	}
 
 	free(buf);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 /*
  * Print every uberblock in label 'lbl' that passes uberblock_verify().
  * A stack vdev_t with only vdev_ashift and vdev_top filled in is used so
  * that the VDEV_UBERBLOCK_* macros can compute the slot count and the
  * offset of each slot for the given 'ashift'.
  */
 static void
 dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
 {
 	char title[ZDB_MAX_UB_HEADER_SIZE];
 	vdev_t fake;
 	vdev_t *fakep = &fake;
 
 	fakep->vdev_ashift = ashift;
 	fakep->vdev_top = fakep;
 
 	for (int slot = 0; slot < VDEV_UBERBLOCK_COUNT(fakep); slot++) {
 		uint64_t where = VDEV_UBERBLOCK_OFFSET(fakep, slot);
 		uberblock_t *ub = (void *)((char *)lbl + where);
 
 		/* A non-zero result means this slot is not a valid one. */
 		if (uberblock_verify(ub) != 0)
 			continue;
 
 		(void) snprintf(title, sizeof (title),
 		    "Uberblock[%d]\n", slot);
 		dump_uberblock(ub, title, "");
 	}
 }
 
 /*
  * Read and print the VDEV_LABELS labels of device 'dev'.
  *
  * A path under /dev/dsk/ is rewritten to the matching /dev/rdsk/ path.
  * Block devices are rejected.  For each label the packed config nvlist
  * is unpacked and dumped; with dump_opt['u'] set, the label's
  * uberblocks are dumped as well, using the ashift found in the config
  * or SPA_MINBLOCKSHIFT when none is available.  Failure to allocate the
  * path, or to open or stat the device, exits with status 1.
  */
 static void
 dump_label(const char *dev)
 {
 	int fd;
 	vdev_label_t label;
 	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
 	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
 	struct stat64 statbuf;
 	uint64_t psize, ashift;
 	size_t len = strlen(dev) + 1;
 
 	if (strncmp(dev, "/dev/dsk/", 9) == 0) {
 		/* "/dev/rdsk/" is one character longer than "/dev/dsk/". */
 		len++;
 		path = malloc(len);
 		if (path != NULL) {
 			(void) snprintf(path, len, "%s%s", "/dev/rdsk/",
 			    dev + 9);
 		}
 	} else {
 		path = strdup(dev);
 	}
 
 	if (path == NULL) {
 		(void) fprintf(stderr, "failed to allocate memory for the "
 		    "path to '%s'\n", dev);
 		exit(1);
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
 		free(path);
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		free(path);
 		(void) close(fd);
 		exit(1);
 	}
 
 	if (S_ISBLK(statbuf.st_mode)) {
 		(void) printf("cannot use '%s': character device required\n",
 		    path);
 		free(path);
 		(void) close(fd);
 		exit(1);
 	}
 
 	/* Label offsets are computed from a label-aligned device size. */
 	psize = statbuf.st_size;
 	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		nvlist_t *config = NULL;
 
 		(void) printf("--------------------------------------------\n");
 		(void) printf("LABEL %d\n", l);
 		(void) printf("--------------------------------------------\n");
 
 		if (pread64(fd, &label, sizeof (label),
 		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
 			(void) printf("failed to read label %d\n", l);
 			continue;
 		}
 
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
 			(void) printf("failed to unpack label %d\n", l);
 			ashift = SPA_MINBLOCKSHIFT;
 		} else {
 			nvlist_t *vdev_tree = NULL;
 
 			dump_nvlist(config, 4);
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 			nvlist_free(config);
 		}
 		if (dump_opt['u'])
 			dump_label_uberblocks(&label, ashift);
 	}
 
 	free(path);
 	(void) close(fd);
 }
 
 /* Number of dumped datasets found using each feature, by spa_feature_t. */
 static uint64_t dataset_feature_count[SPA_FEATURES];
 
 /*
  * Per-dataset callback: own the dataset named 'dsname', tally the
  * features it has in use, dump it with dump_dir() and release it.  The
  * cached FUID table and SA state are reset afterwards.  Always returns
  * 0, even when the dataset cannot be opened.
  */
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
 {
 	objset_t *os;
 	int err;
 
 	err = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (err != 0) {
 		(void) printf("Could not open %s, error %d\n", dsname, err);
 		return (0);
 	}
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (dmu_objset_ds(os)->ds_feature_inuse[f]) {
 			ASSERT(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET);
 			dataset_feature_count[f]++;
 		}
 	}
 
 	dump_dir(os);
 	dmu_objset_disown(os, FTAG);
 
 	/* Cached per-dataset state must not leak into the next one. */
 	fuid_table_destroy();
 	sa_loaded = B_FALSE;
 
 	return (0);
 }
 
 /*
  * Block statistics.
  *
  * One zdb_blkstats_t accumulates totals for a set of block pointers as
  * they are counted by zdb_count_block().  The physical-size histogram
  * has one bucket per SPA_MINBLOCKSIZE step up to SPA_OLD_MAXBLOCKSIZE;
  * larger blocks are folded into its last bucket.
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
 	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
 	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
 	uint64_t zb_count;		/* number of block pointers counted */
 	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
 	/* multi-DVA block pointers with two copies on the same vdev */
 	uint64_t zb_ditto_samevdev;
 	/* indexed by BP_GET_PSIZE() >> SPA_MINBLOCKSHIFT, clamped */
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  *
  * These pseudo types are numbered directly after the real DMU object
  * types.  zdb_ot_extname[] lists their display names in the same order
  * as the ZDB_OT_* definitions, so the two must be kept in step.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 static char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 #define	ZB_TOTAL	DN_MAX_LEVELS
 
 /*
  * State shared by the block-traversal callbacks used by dump_block_stats().
  */
 typedef struct zdb_cb {
 	/* statistics per [level][type]; ZB_TOTAL / ZDB_OT_TOTAL hold sums */
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	/* asize * (refcnt - 1) summed over non-ditto DDT phys entries */
 	uint64_t	zcb_dedup_asize;
 	/* number of non-ditto DDT phys entries contributing to the above */
 	uint64_t	zcb_dedup_blocks;
 	/* embedded block pointers seen, by embedded type */
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	/* embedded block pointers seen, by embedded type and BPE psize */
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE];
 	uint64_t	zcb_start;	/* gethrtime() when traversal began */
 	uint64_t	zcb_lastprint;	/* gethrtime() of last progress line */
 	uint64_t	zcb_totalasize;	/* normal-class alloc, used for ETA */
 	uint64_t	zcb_errors[256]; /* read error counts, by errno */
 	int		zcb_readfails;	/* reset to 0 by zdb_blkptr_cb() */
 	int		zcb_haderrors;	/* nonzero once any error was seen */
 	spa_t		*zcb_spa;	/* pool being traversed */
 } zdb_cb_t;
 
 /*
  * Account for one block pointer.
  *
  * The block is added to four statistics buckets (its own level and the
  * all-levels row, each under its own type and the all-types column).
  * Embedded block pointers are additionally tallied by embedded type and
  * size, after which nothing more is done for them.  Unless -L was given,
  * the block is then claimed; for dedup'd blocks the in-core DDT entry is
  * decremented first and the claim txg depends on the remaining refcount.
  *
  * When zilog is non-NULL the block is first recorded with
  * zil_bp_tree_add(); a nonzero return from that call means the block is
  * skipped entirely.
  */
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
 	boolean_t samevdev = B_FALSE;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	/* Do at least two copies of this block live on the same vdev? */
 	if (BP_GET_NDVAS(bp) == 2) {
 		samevdev = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 		    DVA_GET_VDEV(&bp->blk_dva[1]));
 	} else if (BP_GET_NDVAS(bp) == 3) {
 		samevdev = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 		    DVA_GET_VDEV(&bp->blk_dva[1]) ||
 		    DVA_GET_VDEV(&bp->blk_dva[0]) ==
 		    DVA_GET_VDEV(&bp->blk_dva[2]) ||
 		    DVA_GET_VDEV(&bp->blk_dva[1]) ==
 		    DVA_GET_VDEV(&bp->blk_dva[2]));
 	}
 
 	/*
 	 * Passes 0-1 charge the block's own level, passes 2-3 the
 	 * all-levels row; odd passes use the block's type, even passes
 	 * the all-types column.
 	 */
 	for (int pass = 0; pass < 4; pass++) {
 		int level = (pass < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int ot = (pass & 1) ? type : ZDB_OT_TOTAL;
 		zdb_blkstats_t *stats = &zcb->zcb_type[level][ot];
 		int bucket;
 
 		stats->zb_asize += BP_GET_ASIZE(bp);
 		stats->zb_lsize += BP_GET_LSIZE(bp);
 		stats->zb_psize += BP_GET_PSIZE(bp);
 		stats->zb_count++;
 
 		/*
 		 * The histogram only has room for blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; anything larger lands in the
 		 * final, "other", bucket.
 		 */
 		bucket = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		bucket = MIN(bucket, PSIZE_HISTO_SIZE - 1);
 		stats->zb_psize_histogram[bucket]++;
 
 		stats->zb_gangs += BP_COUNT_GANG(bp);
 
 		if (samevdev)
 			stats->zb_ditto_samevdev++;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 
 	/* With -L no leak detection is done, so there is nothing to claim. */
 	if (dump_opt['L'])
 		return;
 
 	if (BP_GET_DEDUP(bp)) {
 		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
 		ddt_entry_t *dde;
 
 		ddt_enter(ddt);
 		dde = ddt_lookup(ddt, bp, B_FALSE);
 
 		/* refcnt stays 0 when the block has no in-core DDT entry. */
 		if (dde != NULL) {
 			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
 
 			ddt_phys_decref(ddp);
 			refcnt = ddp->ddp_refcnt;
 			if (ddt_phys_total_refcnt(dde) == 0)
 				ddt_remove(ddt, dde);
 		}
 		ddt_exit(ddt);
 	}
 
 	/* A failed claim is fatal: VERIFY3U aborts zdb. */
 	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
 /*
  * Completion callback for the verification reads issued by zdb_blkptr_cb().
  *
  * Frees the read buffer, drops the pool's in-flight scrub I/O count and
  * wakes any thread waiting on it.  A failed read that was not marked
  * ZIO_FLAG_SPECULATIVE is recorded in the zdb_cb_t (the zio's io_private)
  * and reported on stdout.
  */
 /* ARGSUSED */
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	/* The error bookkeeping below is done under spa_scrub_lock. */
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		zcb->zcb_errors[ioerr]++;
 
 		/* Only spell out the block pointer at -bb and above. */
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		/*
 		 * zb_level is printed with %lld, so it is passed as a
 		 * signed value, matching what zdb_blkptr_cb() does.
 		 */
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * traverse_pool() callback used by dump_block_stats().
  *
  * Every non-hole block pointer is counted via zdb_count_block().  When -c
  * was given, an asynchronous raw read of the block is also issued
  * (metadata only for a single -c, everything for -cc); completion is
  * handled by zdb_blkptr_done().  Roughly once per hundred calls, and at
  * most about once a second, a progress line is written to stderr.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	static int iters;
 	zdb_cb_t *zcb = arg;
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 	boolean_t want_read;
 
 	if (bp == NULL)
 		return (0);
 
 	/* At -bbbbb, print every block pointer that has ever been born. */
 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    blkbuf);
 	}
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	type = BP_GET_TYPE(bp);
 
 	/* Types carrying DMU_OT_NEWTYPE are all lumped under "other". */
 	zdb_count_block(zcb, zilog, bp,
 	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 	want_read = (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata));
 
 	if (!BP_IS_EMBEDDED(bp) && want_read) {
 		size_t size = BP_GET_PSIZE(bp);
 		void *data = zio_data_buf_alloc(size);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
 		/* Throttle: wait while too many reads are outstanding. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > max_inflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	if (++iters <= 100)
 		return (0);
 	iters = 0;
 
 	/* No progress line at -bbbbb, or within a second of the last one. */
 	if (dump_opt['b'] >= 5 ||
 	    gethrtime() <= zcb->zcb_lastprint + NANOSEC)
 		return (0);
 
 	{
 		uint64_t now = gethrtime();
 		char done[10];
 		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 		uint64_t elapsed_ms = (now - zcb->zcb_start) / 1000 / 1000;
 		/* bytes per millisecond; the +1s avoid division by zero */
 		int kb_per_sec = 1 + bytes / (1 + elapsed_ms);
 		int sec_remaining =
 		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 		zfs_nicenum(bytes, done, sizeof (done));
 		(void) fprintf(stderr,
 		    "\r%5s completed (%4dMB/s) "
 		    "estimated time remaining: %uhr %02umin %02usec        ",
 		    done, kb_per_sec / 1024,
 		    sec_remaining / 60 / 60,
 		    sec_remaining / 60 % 60,
 		    sec_remaining % 60);
 
 		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 /*
  * Metaslab ops with no allocator.  zdb_leak_init() installs this on every
  * metaslab whose space map it loads, while ms_tree is being used to hold
  * allocated rather than free segments.
  */
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 /*
  * Walk the pool's dedup table ahead of the block traversal.
  *
  * For every entry visited, each phys slot with a nonzero birth txg is
  * turned into a block pointer.  Ditto slots are counted as ZDB_OT_DITTO
  * blocks; every other slot adds asize * (refcnt - 1) to zcb_dedup_asize,
  * i.e. the space accounted to all references but one.  Unless -L was
  * given, the entry is then looked up in the in-core DDT so that
  * zdb_count_block() can find it during the traversal.
  */
 static void
 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	ddt_bookmark_t ddb = { 0 };
 	ddt_entry_t dde;
 	int error;
 
 	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
 		blkptr_t blk;
 		ddt_phys_t *ddp = dde.dde_phys;
 
 		/*
 		 * The walk ends as soon as it reaches the unique class;
 		 * entries seen before that are expected to have more than
 		 * one reference (see the ASSERT below).
 		 */
 		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
 			return;
 
 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
 
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			/* Skip phys slots that were never born. */
 			if (ddp->ddp_phys_birth == 0)
 				continue;
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 			if (p == DDT_PHYS_DITTO) {
 				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			} else {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
 				zcb->zcb_dedup_blocks++;
 			}
 		}
 		/*
 		 * NOTE(review): blk here is whatever the last phys slot with
 		 * a nonzero birth produced; if an entry had no such slot it
 		 * would be uninitialized -- confirm that cannot happen.
 		 * The B_TRUE argument presumably asks ddt_lookup() to add
 		 * the entry if it is missing; verify against ddt_lookup().
 		 */
 		if (!dump_opt['L']) {
 			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
 			ddt_enter(ddt);
 			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
 			ddt_exit(ddt);
 		}
 	}
 
 	/* Any termination other than running off the end is unexpected. */
 	ASSERT(error == ENOENT);
 }
 
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				mutex_enter(&msp->ms_lock);
 				metaslab_unload(msp);
 
 				/*
 				 * For leak detection, we overload the metaslab
 				 * ms_tree to contain allocated segments
 				 * instead of free segments. As a result,
 				 * we can't use the normal metaslab_load/unload
 				 * interfaces.
 				 */
 				if (msp->ms_sm != NULL) {
 					(void) fprintf(stderr,
 					    "\rloading space map for "
 					    "vdev %llu of %llu, "
 					    "metaslab %llu of %llu ...",
 					    (longlong_t)c,
 					    (longlong_t)rvd->vdev_children,
 					    (longlong_t)m,
 					    (longlong_t)vd->vdev_ms_count);
 
 					msp->ms_ops = &zdb_metaslab_ops;
 
 					/*
 					 * We don't want to spend the CPU
 					 * manipulating the size-ordered
 					 * tree, so clear the range_tree
 					 * ops.
 					 */
 					msp->ms_tree->rt_ops = NULL;
 					VERIFY0(space_map_load(msp->ms_sm,
 					    msp->ms_tree, SM_ALLOC));
 					msp->ms_loaded = B_TRUE;
 				}
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 		(void) fprintf(stderr, "\n");
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	zdb_ddt_leak_init(spa, zcb);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Finish leak detection: empty every metaslab's ms_tree, reporting each
  * remaining segment through zdb_leak(), and mark the metaslab unloaded.
  * Does nothing when -L was given.
  */
 static void
 zdb_leak_fini(spa_t *spa)
 {
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/*
 		 * The indices are uint64_t, as in zdb_leak_init(), so that
 		 * they are not signed values compared against the vdev's
 		 * unsigned child and metaslab counts.
 		 */
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				mutex_enter(&msp->ms_lock);
 
 				/*
 				 * The ms_tree has been overloaded to
 				 * contain allocated segments. Now that we
 				 * finished traversing all blocks, any
 				 * block that remains in the ms_tree
 				 * represents an allocated block that we
 				 * did not claim during the traversal.
 				 * Claimed blocks would have been removed
 				 * from the ms_tree.
 				 */
 				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
 				msp->ms_loaded = B_FALSE;
 
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 	}
 }
 
 /*
  * Iterator callback used by dump_block_stats() for the deferred-free and
  * free lists: count each block pointer as ZDB_OT_DEFERRED, printing it
  * first at -bbbbb.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	if (dump_opt['b'] >= 5) {
 		char text[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(text, sizeof (text), bp);
 		(void) printf("[%s] %s\n",
 		    "deferred free", text);
 	}
 
 	zdb_count_block((zdb_cb_t *)arg, NULL, bp, ZDB_OT_DEFERRED);
 
 	return (0);
 }
 
 /*
  * Traverse every block in the pool, gathering statistics and (unless -L
  * was given) checking for leaks, then print a summary.
  *
  * Returns 2 if no blocks were counted or if the traversed size does not
  * match the allocated size, 3 if errors were recorded during the traversal,
  * and 0 otherwise.
  */
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t zcb = { 0 };
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 
 	/* Describe what this run is going to verify, based on -c and -L. */
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * Load all space maps as SM_ALLOC maps, then traverse the pool
 	 * claiming each block we discover.  If the pool is perfectly
 	 * consistent, the space maps will be empty when we're done.
 	 * Anything left over is a leak; any block we can't claim (because
 	 * it's not part of any space map) is a double allocation,
 	 * reference to a freed block, or an unclaimed log block.
 	 */
 	zdb_leak_init(spa, &zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    count_block_cb, &zcb, NULL);
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    count_block_cb, &zcb, NULL);
 	}
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    &zcb, NULL));
 	}
 
 	/* -cc reads data blocks too, so have the traversal prefetch them. */
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	/* zcb_totalasize and zcb_start feed the progress/ETA line. */
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (int i = 0; i < max_ncpus; i++) {
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 			/* Install a fresh root zio in place of the waited one. */
 			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (int e = 0; e < 256; e++) {
 			if (zcb.zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb.zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	zdb_leak_fini(spa);
 
 	/* Grand total across all levels and all types. */
 	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	/*
 	 * Compare what the metaslab classes say is allocated (normal plus
 	 * log) with what the traversal found, less the space that
 	 * zdb_ddt_leak_init() attributed to additional dedup references.
 	 */
 	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
 	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
 
 	if (total_found == total_alloc) {
 		if (!dump_opt['L'])
 			(void) printf("\n\tNo leaks (block sum matches space"
 			    " maps exactly)\n");
 	} else {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
 		leaks = B_TRUE;
 	}
 
 	/* Nothing was counted; this also guards the divisions below. */
 	if (tzb->zb_count == 0)
 		return (2);
 
 	(void) printf("\n");
 	(void) printf("\tbp count:      %10llu\n",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\tganged count:  %10llu\n",
 	    (longlong_t)tzb->zb_gangs);
 	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\tbp physical:   %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\tbp allocated:  %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\tbp deduped:    %10llu    ref>1:"
 	    " %6llu   deduplication: %6.2f\n",
 	    (u_longlong_t)zcb.zcb_dedup_asize,
 	    (u_longlong_t)zcb.zcb_dedup_blocks,
 	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	/* Embedded block pointers are reported separately, per type. */
 	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb.zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb.zcb_embedded_histogram[i],
 			    sizeof (zcb.zcb_embedded_histogram[i]) /
 			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (longlong_t)tzb->zb_ditto_samevdev);
 	}
 
 	/* -bb and above: one table row per object type (and per level). */
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			char csize[32], lsize[32], psize[32], asize[32];
 			char avg[32], gang[32];
 			char *typename;
 
 			/* Regular DMU types first, then zdb's extended ones. */
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			/* Types with nothing allocated get a row of dashes. */
 			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			/*
 			 * Walk the levels from highest to lowest; the final
 			 * pass (l == -1) stands for the ZB_TOTAL row.
 			 */
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb.zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				/* Per-level rows only at -bbb and above. */
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				/*
 				 * Skip the L0 row when it would merely
 				 * repeat the total for this type.
 				 */
 				if (level == 0 && zb->zb_asize ==
 				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize);
 				zdb_nicenum(zb->zb_lsize, lsize);
 				zdb_nicenum(zb->zb_psize, psize);
 				zdb_nicenum(zb->zb_asize, asize);
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
 				zdb_nicenum(zb->zb_gangs, gang);
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 	}
 
 	(void) printf("\n");
 
 	/* A size mismatch takes precedence over traversal errors. */
 	if (leaks)
 		return (2);
 
 	if (zcb.zcb_haderrors)
 		return (3);
 
 	return (0);
 }
 
 /*
  * One node of the AVL tree built by dump_simulated_ddt(): the dedup key of
  * a block plus running totals over every block pointer that produced the
  * same key (accumulated in zdb_ddt_add_cb()).
  */
 typedef struct zdb_ddt_entry {
 	ddt_key_t	zdde_key;	/* filled by ddt_key_fill() */
 	uint64_t	zdde_ref_blocks; /* block pointers with this key */
 	uint64_t	zdde_ref_lsize;	/* sum of their BP_GET_LSIZE() */
 	uint64_t	zdde_ref_psize;	/* sum of their BP_GET_PSIZE() */
 	uint64_t	zdde_ref_dsize;	/* sum of bp_get_dsize_sync() */
 	avl_node_t	zdde_node;	/* AVL linkage */
 } zdb_ddt_entry_t;
 
 /*
  * traverse_pool() callback for dump_simulated_ddt(); arg is the AVL tree
  * of zdb_ddt_entry_t being built.
  *
  * Only level-0, non-metadata blocks that carry a checksum are considered.
  * Each such block is keyed with ddt_key_fill() and folded into the entry
  * for that key, creating the entry on first sight.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	avl_tree_t *tree = arg;
 	avl_index_t where;
 	zdb_ddt_entry_t probe;
 	zdb_ddt_entry_t *entry;
 
 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/* With -SS, report progress each time an objset root is reached. */
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(tree));
 	}
 
 	/* Holes were already rejected above. */
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
 		return (0);
 	if (BP_GET_LEVEL(bp) > 0)
 		return (0);
 	if (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	ddt_key_fill(&probe.zdde_key, bp);
 
 	entry = avl_find(tree, &probe, &where);
 	if (entry == NULL) {
 		entry = umem_zalloc(sizeof (*entry), UMEM_NOFAIL);
 		entry->zdde_key = probe.zdde_key;
 		avl_insert(tree, entry, where);
 	}
 
 	entry->zdde_ref_blocks += 1;
 	entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	entry->zdde_ref_psize += BP_GET_PSIZE(bp);
 	entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 /*
  * Implement -S: traverse the pool collecting every candidate block into an
  * AVL tree keyed by dedup key (see zdb_ddt_add_cb()), convert the tree
  * into a DDT histogram bucketed by reference count, and print the
  * histogram and the resulting dedup ratio.
  */
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t tree;
 	void *cookie = NULL;
 	zdb_ddt_entry_t *entry;
 	ddt_histogram_t histo_total = { 0 };
 	ddt_stat_t stat_total = { 0 };
 
 	avl_create(&tree, ddt_entry_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 	    zdb_ddt_add_cb, &tree);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Drain the tree, folding each entry into the histogram. */
 	for (;;) {
 		ddt_stat_t stat;
 		uint64_t refs;
 
 		entry = avl_destroy_nodes(&tree, &cookie);
 		if (entry == NULL)
 			break;
 
 		refs = entry->zdde_ref_blocks;
 		ASSERT(refs != 0);
 
 		/* Per-copy figures are the referenced totals / refcount. */
 		stat.dds_blocks = entry->zdde_ref_blocks / refs;
 		stat.dds_lsize = entry->zdde_ref_lsize / refs;
 		stat.dds_psize = entry->zdde_ref_psize / refs;
 		stat.dds_dsize = entry->zdde_ref_dsize / refs;
 
 		stat.dds_ref_blocks = entry->zdde_ref_blocks;
 		stat.dds_ref_lsize = entry->zdde_ref_lsize;
 		stat.dds_ref_psize = entry->zdde_ref_psize;
 		stat.dds_ref_dsize = entry->zdde_ref_dsize;
 
 		/* The bucket is selected by the highest set bit of refs. */
 		ddt_stat_add(&histo_total.ddh_stat[highbit64(refs) - 1],
 		    &stat, 0);
 
 		umem_free(entry, sizeof (*entry));
 	}
 
 	avl_destroy(&tree);
 
 	ddt_histogram_stat(&stat_total, &histo_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
 	zpool_dump_ddt(&stat_total, &histo_total);
 
 	dump_dedup_ratio(&stat_total);
 }
 
 /*
  * Top-level driver for dumping a pool: run each report selected by the
  * command-line options in dump_opt[].
  *
  * -S short-circuits everything else.  If any verification step yields a
  * nonzero result, the process exits with that value once the remaining
  * reports (-s, -h) have been printed.
  */
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_full_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_full_bpobj(
 				    &spa->spa_dsl_pool->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    spa->spa_dsl_pool->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 
 		/* Dump every dataset; this also fills dataset_feature_count. */
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		/*
 		 * Cross-check each per-dataset feature: the number of
 		 * datasets seen using it must equal its refcount.
 		 */
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 			/*
 			 * Start from zero: the lookup's return value is
 			 * ignored, so if it does not store a result the
 			 * comparison below must not read an indeterminate
 			 * value.
 			 */
 			uint64_t refcount = 0;
 
 			if (!(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET)) {
 				ASSERT0(dataset_feature_count[f]);
 				continue;
 			}
 			(void) feature_get_refcount(spa,
 			    &spa_feature_table[f], &refcount);
 			if (dataset_feature_count[f] != refcount) {
 				(void) printf("%s feature refcount mismatch: "
 				    "%lld datasets != %lld refcount\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)dataset_feature_count[f],
 				    (longlong_t)refcount);
 				rc = 2;
 			} else {
 				(void) printf("Verified %s feature refcount "
 				    "of %llu is correct\n",
 				    spa_feature_table[f].fi_uname,
 				    (u_longlong_t)refcount);
 			}
 		}
 	}
 
 	/* Later checks are skipped once an earlier one has failed. */
 	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)
 		rc = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (rc != 0)
 		exit(rc);
 }
 
 /*
  * Option bits for zdb_read_block() and the block-dumping helpers below.
  */
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_PHYS		0x0020
 #define	ZDB_FLAG_RAW		0x0040
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0080
 
 /*
  * Maps a flag character (as an unsigned char index) to its ZDB_FLAG_* bit;
  * zdb_read_block() treats a zero entry as an invalid flag.  The table is
  * filled in elsewhere in this file.
  */
 int flagbits[256];
 
 /*
  * Print one block pointer on its own line.  With ZDB_FLAG_BSWAP the block
  * pointer is byteswapped in place first, so the caller's copy is modified.
  */
 static void
 zdb_print_blkptr(blkptr_t *bp, int flags)
 {
 	char text[BP_SPRINTF_LEN];
 	int swap = (flags & ZDB_FLAG_BSWAP);
 
 	if (swap)
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("%s\n", text);
 }
 
 /*
  * Print nbps consecutive block pointers starting at bp, one per line.
  */
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	blkptr_t *cur = bp;
 	int remaining = nbps;
 
 	while (remaining-- > 0)
 		zdb_print_blkptr(cur++, flags);
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 /*
  * Write size bytes of buf to standard output unformatted, byteswapping
  * the buffer in place first when ZDB_FLAG_BSWAP is set.
  *
  * write() may accept fewer bytes than requested, so keep writing until
  * the whole buffer has gone out.  As before, a failed write is not
  * reported; the dump simply stops at that point.
  */
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	const char *next = buf;
 	uint64_t left = size;
 
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
 
 	while (left > 0) {
 		ssize_t done = write(1, next, left);
 
 		if (done <= 0)
 			break;
 		next += done;
 		left -= done;
 	}
 }
 
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	int nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	int i, j;
 	char *hdr, *c;
 
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		c = (char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  *
  * Returns the matching vdev, or NULL if vdev is NULL or nothing matches.
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, char *path)
 {
 	char *s, *p, *q;
 	int i;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/* First, assume the x.x.x.x format */
 	i = (int)strtoul(path, &s, 10);
 	if (s == path || (s && *s != '.' && *s != '\0'))
 		goto name;
 	if (i < 0 || i >= vdev->vdev_children)
 		return (NULL);
 
 	/* Descend into the selected child for each further ".n" component. */
 	vdev = vdev->vdev_child[i];
 	if (*s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 		size_t pathlen;
 
 		/* Children without a path are searched recursively. */
 		if (vc->vdev_path == NULL) {
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		/* Match on the full path ... */
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 
 		/* ... or on its last component ... */
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 		if (strcmp(p, path) == 0)
 			return (vc);
 
 		/*
 		 * ... or on the last component with a trailing "s0"
 		 * ignored.  A path shorter than two characters cannot
 		 * end in "s0", and forming &vdev_path[len - 2] for it
 		 * would point before the start of the string.
 		 */
 		pathlen = strlen(vc->vdev_path);
 		if (pathlen < 2)
 			continue;
 		q = &vc->vdev_path[pathlen - 2];
 		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
  *
  *	pool:vdev_specifier:offset:size[:flags]
  *
  *	pool           - The name of the pool you wish to read from
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	size           - Amount of data to read, in hex, in bytes
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		*c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 p: Do I/O to physical offset
  *		 r: Dump raw data to stdout
  *
  *              * = not yet implemented
  *
  * NOTE(review): the parser below takes the first ':'-separated field of
  * thing as the vdev specifier, so the caller presumably passes the
  * descriptor with the pool name already removed -- confirm in main().
  *
  * All problems are reported on stdout and the function simply returns.
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	void *pbuf, *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
 	int i, error;
 
 	/* Split a private copy of the descriptor into its fields. */
 	dup = strdup(thing);
 	s = strtok(dup, ":");
 	vdev = s ? s : "";
 	s = strtok(NULL, ":");
 	offset = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	size = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	flagstr = s ? s : "";
 
 	s = NULL;
 	if (size == 0)
 		s = "size must not be zero";
 	/* Both I/O buffers below are SPA_MAXBLOCKSIZE bytes long. */
 	if (size > SPA_MAXBLOCKSIZE)
 		s = "size must not exceed the maximum block size";
 	if (!IS_P2ALIGNED(size, DEV_BSIZE))
 		s = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		s = "offset must be a multiple of sector size";
 	if (s) {
 		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
 		free(dup);
 		return;
 	}
 
 	/* Translate the flag characters into ZDB_FLAG_* bits. */
 	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
 		for (i = 0; flagstr[i]; i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Invalid flag: %c\n",
 				    flagstr[i]);
 				continue;
 			}
 			flags |= bit;
 
 			/* If it's not something with an argument, keep going */
 			if ((bit & (ZDB_FLAG_CHECKSUM |
 			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
 				continue;
 
 			p = &flagstr[i + 1];
 			if (bit == ZDB_FLAG_PRINT_BLKPTR)
 				blkptr_offset = strtoull(p, &p, 16);
 			if (*p != ':' && *p != '\0') {
 				(void) printf("***Invalid flag arg: '%s'\n", s);
 				free(dup);
 				return;
 			}
 			i += p - &flagstr[i + 1]; /* skip over the number */
 		}
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		free(dup);
 		return;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	psize = size;
 	lsize = size;
 
 	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	/* Fabricate a block pointer describing the requested region. */
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
 		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		/*
 		 * We don't know how the data was compressed, so just try
 		 * every decompress function at every inflated blocksize.
 		 */
 		enum zio_compress c;
 		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 		bcopy(pbuf, pbuf2, psize);
 
 		/*
 		 * Pad the two copies with different pseudo-random bytes
 		 * past psize; a candidate is accepted only if both copies
 		 * decompress to identical output.
 		 */
 		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
 		    SPA_MAXBLOCKSIZE - psize) == 0);
 
 		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
 		    SPA_MAXBLOCKSIZE - psize) == 0);
 
 		/*
 		 * Step lsize down one SPA_MINBLOCKSIZE at a time.  The
 		 * loop header is the only place lsize is decremented, so
 		 * that no candidate size is skipped.
 		 */
 		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
 		    lsize -= SPA_MINBLOCKSIZE) {
 			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
 				if (zio_decompress_data(c, pbuf, lbuf,
 				    psize, lsize) == 0 &&
 				    zio_decompress_data(c, pbuf2, lbuf2,
 				    psize, lsize) == 0 &&
 				    bcmp(lbuf, lbuf2, lsize) == 0)
 					break;
 			}
 			if (c != ZIO_COMPRESS_FUNCTIONS)
 				break;
 		}
 
 		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
 		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
 
 		/* The loop ran out of sizes without finding a match. */
 		if (lsize <= psize) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 		buf = lbuf;
 		size = lsize;
 	} else {
 		buf = pbuf;
 		size = psize;
 	}
 
 	if (flags & ZDB_FLAG_PRINT_BLKPTR) {
 		/* The whole blkptr must lie inside the data we have. */
 		if (size < sizeof (blkptr_t) ||
 		    blkptr_offset > size - sizeof (blkptr_t)) {
 			(void) printf("***Invalid blkptr offset: 0x%llx\n",
 			    (u_longlong_t)blkptr_offset);
 			goto out;
 		}
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	} else if (flags & ZDB_FLAG_RAW) {
 		zdb_dump_block_raw(buf, size, flags);
 	} else if (flags & ZDB_FLAG_INDIRECT) {
 		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
 		    flags);
 	} else if (flags & ZDB_FLAG_GBH) {
 		zdb_dump_gbh(buf, flags);
 	} else {
 		zdb_dump_block(thing, buf, size, flags);
 	}
 
 out:
 	umem_free(pbuf, SPA_MAXBLOCKSIZE);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }
 
 /*
  * Does the pool configuration cfg describe the pool named by tgt?
  *
  * tgt is first parsed as a number (any base strtoull() accepts with base
  * 0).  A nonzero result is compared with the config's pool GUID; otherwise
  * tgt is compared with the config's pool name.  A config lacking the
  * relevant entry never matches.
  */
 static boolean_t
 pool_match(nvlist_t *cfg, char *tgt)
 {
 	uint64_t want_guid = strtoull(tgt, NULL, 0);
 
 	if (want_guid == 0) {
 		char *pool_name;
 
 		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
 		    &pool_name) != 0)
 			return (B_FALSE);
 		return (strcmp(pool_name, tgt) == 0);
 	} else {
 		uint64_t pool_guid;
 
 		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID,
 		    &pool_guid) != 0)
 			return (B_FALSE);
 		return (pool_guid == want_guid);
 	}
 }
 
 /*
  * Search the importable pools (looking in the 'dirc' directories listed in
  * 'dirv') for the pool that '*target' refers to.  Only the part of
  * '*target' before the first '/' or '@' is used for matching; the string
  * is temporarily truncated there and restored afterwards.
  *
  * If more than one pool matches, every matching configuration is printed
  * and fatal() is called.  On a single match the pool name is returned and
  * '*configp' is set to its configuration; if '*target' does not already
  * start with that name (e.g. a GUID was given) '*target' is replaced by a
  * newly allocated "<name><rest of target>" string.  With no match, NULL is
  * returned and '*configp' is set to NULL.
  */
 static char *
 find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
 {
 	importargs_t args = { 0 };
 	nvlist_t *found_cfg = NULL;
 	nvlist_t *pools;
 	nvpair_t *elem;
 	char *found_name = NULL;
 	char *sepp;
 	char saved_sep = '\0';
 	int nmatched = 0;
 
 	args.paths = dirc;
 	args.path = dirv;
 	args.can_be_active = B_TRUE;
 
 	/* Cut the target down to its pool component for the search. */
 	sepp = strpbrk(*target, "/@");
 	if (sepp != NULL) {
 		saved_sep = *sepp;
 		*sepp = '\0';
 	}
 
 	pools = zpool_search_import(g_zfs, &args);
 
 	for (elem = (pools != NULL) ? nvlist_next_nvpair(pools, NULL) : NULL;
 	    elem != NULL; elem = nvlist_next_nvpair(pools, elem)) {
 		verify(nvpair_value_nvlist(elem, configp) == 0);
 		if (!pool_match(*configp, *target))
 			continue;
 
 		nmatched++;
 		if (found_cfg == NULL) {
 			/* First hit: remember it, print nothing yet. */
 			found_cfg = *configp;
 			found_name = nvpair_name(elem);
 			continue;
 		}
 
 		/* print previously found config */
 		if (found_name != NULL) {
 			(void) printf("%s\n", found_name);
 			dump_nvlist(found_cfg, 8);
 			found_name = NULL;
 		}
 		(void) printf("%s\n", nvpair_name(elem));
 		dump_nvlist(*configp, 8);
 	}
 
 	if (nmatched > 1)
 		(void) fatal("\tMatched %d pools - use pool GUID "
 		    "instead of pool name or \n"
 		    "\tpool name part of a dataset name to select pool",
 		    nmatched);
 
 	/* Undo the temporary truncation of the caller's string. */
 	if (sepp != NULL)
 		*sepp = saved_sep;
 
 	/*
 	 * If pool GUID was specified for pool id, replace it with pool name
 	 */
 	if (found_name != NULL && strstr(*target, found_name) != *target) {
 		const char *suffix = (sepp != NULL) ? sepp : "";
 		int sz = 1 + strlen(found_name) + strlen(suffix);
 
 		*target = umem_alloc(sz, UMEM_NOFAIL);
 		(void) snprintf(*target, sz, "%s%s", found_name, suffix);
 	}
 
 	*configp = (found_name != NULL) ? found_cfg : NULL;
 
 	return (found_name);
 }
 
 /*
  * zdb entry point.
  *
  * Parses the command line, initializes the userland kernel emulation and
  * libzfs, optionally imports the target pool from on-disk labels (-e), and
  * then opens either the pool or a single dataset, depending on whether the
  * target contains '/' or '@'.  Remaining arguments are object numbers to
  * dump, or, with -R, block specifiers handed to zdb_read_block().
  *
  * Returns 0 on success.  Errors are reported through fatal() or usage(),
  * which are expected not to return.
  */
 int
 main(int argc, char **argv)
 {
 	int i, c;
 	struct rlimit rl = { 1024, 1024 };
 	spa_t *spa = NULL;
 	objset_t *os = NULL;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target;
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int rewind = ZPOOL_NEVER_REWIND;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
 
 	dprintf_setup(&argc, argv);
 
 	while ((c = getopt(argc, argv,
 	    "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
 		switch (c) {
 		/* Options that select what to dump; any of them disables "all". */
 		case 'b':
 		case 'c':
 		case 'd':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 's':
 		case 'u':
 		case 'C':
 		case 'D':
 		case 'M':
 		case 'R':
 		case 'S':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		/* Modifier options; these leave dump_all untouched. */
 		case 'A':
 		case 'F':
 		case 'L':
 		case 'X':
 		case 'e':
 		case 'P':
 			dump_opt[c]++;
 			break;
 		case 'I':
 			max_inflight = strtoull(optarg, NULL, 0);
 			if (max_inflight == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight I/Os must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'p':
 			/* Grow the search directory array by one entry. */
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				bcopy(searchdirs, tmp, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			spa_config_path = optarg;
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	kernel_init(FREAD);
 	g_zfs = libzfs_init();
 	if (g_zfs == NULL)
 		fatal("Fail to initialize zfs");
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	/*
 	 * With no explicit selection, turn on every option except the ones
 	 * listed here; then add the verbosity to each enabled option.
 	 */
 	for (c = 0; c < 256; c++) {
 		if (dump_all && !strchr("elAFLRSXP", c))
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
 	zfs_recover = (dump_opt['A'] > 1);
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 2 && dump_opt['R'])
 		usage();
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			return (0);
 		}
 		usage();
 	}
 
 	if (dump_opt['l']) {
 		dump_label(argv[0]);
 		return (0);
 	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 	target = argv[0];
 
 	if (dump_opt['e']) {
 		nvlist_t *cfg = NULL;
 		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
 
 		/* Stays ENOENT unless a pool is found and imported below. */
 		error = ENOENT;
 		if (name) {
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_REWIND_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 			/* Retry with a verbatim import if the first fails. */
 			if ((error = spa_import(name, cfg, NULL,
 			    ZFS_IMPORT_MISSING_LOG)) != 0) {
 				error = spa_import(name, cfg, NULL,
 				    ZFS_IMPORT_VERBATIM);
 			}
 		}
 	}
 
 	if (error == 0) {
 		if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else {
 			error = dmu_objset_own(target, DMU_OST_ANY,
 			    B_TRUE, FTAG, &os);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	/* Skip the target; what remains are objects or block specifiers. */
 	argv++;
 	argc--;
 	if (!dump_opt['R']) {
 		if (argc > 0) {
 			zopt_objects = argc;
 			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
 			/* calloc() can fail; do not write through NULL. */
 			if (zopt_object == NULL)
 				fatal("internal error: %s",
 				    strerror(ENOMEM));
 			for (i = 0; i < zopt_objects; i++) {
 				errno = 0;
 				zopt_object[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_object[i] == 0 && errno != 0)
 					fatal("bad number %s: %s",
 					    argv[i], strerror(errno));
 			}
 		}
 		if (os != NULL) {
 			dump_dir(os);
 		} else if (zopt_objects > 0 && !dump_opt['m']) {
 			dump_dir(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		/* Map the flag letters accepted in -R block specifiers. */
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['p'] = ZDB_FLAG_PHYS;
 		flagbits['r'] = ZDB_FLAG_RAW;
 
 		for (i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	/* Release whichever of the two handles was acquired above. */
 	(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
 
 	fuid_table_destroy();
 	sa_loaded = B_FALSE;
 
 	libzfs_fini(g_zfs);
 	kernel_fini();
 
 	return (0);
 }
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 296519)
@@ -1,7162 +1,7163 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2012 Milan Jurik. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright (c) 2013 Steven Hartland.  All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libgen.h>
 #include <libintl.h>
 #include <libuutil.h>
 #include <libnvpair.h>
 #include <locale.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <zone.h>
 #include <grp.h>
 #include <pwd.h>
 #include <signal.h>
 #include <sys/list.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 #include <time.h>
 #include <err.h>
 #include <jail.h>
 
 #include <libzfs.h>
 #include <libzfs_core.h>
 #include <zfs_prop.h>
 #include <zfs_deleg.h>
 #include <libuutil.h>
 #ifdef illumos
 #include <aclutils.h>
 #include <directory.h>
 #include <idmap.h>
 #endif
 
 #include "zfs_iter.h"
 #include "zfs_util.h"
 #include "zfs_comutil.h"
 
 /* libzfs library handle passed to the libzfs calls made by the sub-commands. */
 libzfs_handle_t *g_zfs;
 
 /* NOTE(review): presumably the open mnttab stream; not used in this part. */
 static FILE *mnttab_file;
 /* Buffer of HIS_MAX_RECORD_LEN bytes; presumably the history record text. */
 static char history_str[HIS_MAX_RECORD_LEN];
 /* Defaults to B_TRUE; presumably controls whether the command is logged. */
 static boolean_t log_history = B_TRUE;
 
 /*
  * Forward declarations of the sub-command handlers.  They are needed
  * because command_table references the handlers before their definitions.
  * Every handler has the same (argc, argv) signature and returns an int.
  */
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
 static int zfs_do_destroy(int argc, char **argv);
 static int zfs_do_get(int argc, char **argv);
 static int zfs_do_inherit(int argc, char **argv);
 static int zfs_do_list(int argc, char **argv);
 static int zfs_do_mount(int argc, char **argv);
 static int zfs_do_rename(int argc, char **argv);
 static int zfs_do_rollback(int argc, char **argv);
 static int zfs_do_set(int argc, char **argv);
 static int zfs_do_upgrade(int argc, char **argv);
 static int zfs_do_snapshot(int argc, char **argv);
 static int zfs_do_unmount(int argc, char **argv);
 static int zfs_do_share(int argc, char **argv);
 static int zfs_do_unshare(int argc, char **argv);
 static int zfs_do_send(int argc, char **argv);
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
 static int zfs_do_userspace(int argc, char **argv);
 static int zfs_do_allow(int argc, char **argv);
 static int zfs_do_unallow(int argc, char **argv);
 static int zfs_do_hold(int argc, char **argv);
 static int zfs_do_holds(int argc, char **argv);
 static int zfs_do_release(int argc, char **argv);
 static int zfs_do_diff(int argc, char **argv);
 static int zfs_do_jail(int argc, char **argv);
 static int zfs_do_unjail(int argc, char **argv);
 static int zfs_do_bookmark(int argc, char **argv);
 
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
  */
 
 #ifdef DEBUG
 /* Supplies the libumem debugging configuration used on DEBUG builds. */
 const char *
 _umem_debug_init(void)
 {
 	/* $UMEM_DEBUG setting */
 	static const char umem_debug_setting[] = "default,verbose";
 
 	return (umem_debug_setting);
 }
 
 /* Supplies the libumem logging configuration used on DEBUG builds. */
 const char *
 _umem_logging_init(void)
 {
 	/* $UMEM_LOGGING setting */
 	static const char umem_logging_setting[] = "fail,contents";
 
 	return (umem_logging_setting);
 }
 #endif
 
 /*
  * Identifies the usage message of a sub-command.  Each command_table
  * entry carries one of these values, and get_usage() maps it to the
  * corresponding help text.
  */
 typedef enum {
 	HELP_CLONE,
 	HELP_CREATE,
 	HELP_DESTROY,
 	HELP_GET,
 	HELP_INHERIT,
 	HELP_UPGRADE,
 	HELP_JAIL,
 	HELP_UNJAIL,
 	HELP_LIST,
 	HELP_MOUNT,
 	HELP_PROMOTE,
 	HELP_RECEIVE,
 	HELP_RENAME,
 	HELP_ROLLBACK,
 	HELP_SEND,
 	HELP_SET,
 	HELP_SHARE,
 	HELP_SNAPSHOT,
 	HELP_UNMOUNT,
 	HELP_UNSHARE,
 	HELP_ALLOW,
 	HELP_UNALLOW,
 	HELP_USERSPACE,
 	HELP_GROUPSPACE,
 	HELP_HOLD,
 	HELP_HOLDS,
 	HELP_RELEASE,
 	HELP_DIFF,
 	HELP_BOOKMARK,
 } zfs_help_t;
 
 /* One entry of the sub-command table. */
 typedef struct zfs_command {
 	const char	*name;	/* sub-command name; NULL marks a blank line */
 	int		(*func)(int argc, char **argv);	/* handler */
 	zfs_help_t	usage;	/* key of the usage text, see get_usage() */
 } zfs_command_t;
 
 /*
  * Master command table.  Each ZFS command has a name, associated function, and
  * usage message.  The usage messages need to be internationalized, so we have
  * to have a function (get_usage()) to return the usage message based on a
  * command index.
  *
  * These commands are organized according to how they are displayed in the usage
  * message.  An empty command (one with a NULL name) indicates an empty line in
  * the generic usage message.
  *
  * Note that "userspace" and "groupspace" share one handler and differ only
  * in their usage text.
  */
 static zfs_command_t command_table[] = {
 	{ "create",	zfs_do_create,		HELP_CREATE		},
 	{ "destroy",	zfs_do_destroy,		HELP_DESTROY		},
 	{ NULL },
 	{ "snapshot",	zfs_do_snapshot,	HELP_SNAPSHOT		},
 	{ "rollback",	zfs_do_rollback,	HELP_ROLLBACK		},
 	{ "clone",	zfs_do_clone,		HELP_CLONE		},
 	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
 	{ "rename",	zfs_do_rename,		HELP_RENAME		},
 	{ "bookmark",	zfs_do_bookmark,	HELP_BOOKMARK		},
 	{ NULL },
 	{ "list",	zfs_do_list,		HELP_LIST		},
 	{ NULL },
 	{ "set",	zfs_do_set,		HELP_SET		},
 	{ "get",	zfs_do_get,		HELP_GET		},
 	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
 	{ "upgrade",	zfs_do_upgrade,		HELP_UPGRADE		},
 	{ "userspace",	zfs_do_userspace,	HELP_USERSPACE		},
 	{ "groupspace",	zfs_do_userspace,	HELP_GROUPSPACE		},
 	{ NULL },
 	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
 	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
 	{ "share",	zfs_do_share,		HELP_SHARE		},
 	{ "unshare",	zfs_do_unshare,		HELP_UNSHARE		},
 	{ NULL },
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
 	{ NULL },
 	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
 	{ NULL },
 	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
 	{ NULL },
 	{ "hold",	zfs_do_hold,		HELP_HOLD		},
 	{ "holds",	zfs_do_holds,		HELP_HOLDS		},
 	{ "release",	zfs_do_release,		HELP_RELEASE		},
 	{ "diff",	zfs_do_diff,		HELP_DIFF		},
 	{ NULL },
 	{ "jail",	zfs_do_jail,		HELP_JAIL		},
 	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
 };
 
 /* Number of entries in command_table, blank-line separators included. */
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
 
 /* Sub-command being run; while NULL, usage() prints the full command list. */
 zfs_command_t *current_command;
 
 /*
  * Return the usage text (passed through gettext()) for the sub-command
  * identified by 'idx'.  Every message is one or more tab-indented lines
  * and ends with a newline, because usage() prints the messages back to
  * back with a plain "%s" format.  Calls abort() if 'idx' is not one of the
  * zfs_help_t values handled below.
  */
 static const char *
 get_usage(zfs_help_t idx)
 {
 	switch (idx) {
 	case HELP_CLONE:
 		return (gettext("\tclone [-p] [-o property=value] ... "
 		    "<snapshot> <filesystem|volume>\n"));
 	case HELP_CREATE:
 		return (gettext("\tcreate [-pu] [-o property=value] ... "
 		    "<filesystem>\n"
 		    "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
 		    "-V <size> <volume>\n"));
 	case HELP_DESTROY:
 		return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
 		    "\tdestroy [-dnpRrv] "
 		    "<filesystem|volume>@<snap>[%<snap>][,...]\n"
 		    "\tdestroy <filesystem|volume>#<bookmark>\n"));
 	case HELP_GET:
 		return (gettext("\tget [-rHp] [-d max] "
 		    "[-o \"all\" | field[,...]]\n"
 		    "\t    [-t type[,...]] [-s source[,...]]\n"
 		    "\t    <\"all\" | property[,...]> "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_INHERIT:
 		return (gettext("\tinherit [-rS] <property> "
 		    "<filesystem|volume|snapshot> ...\n"));
 	case HELP_UPGRADE:
 		return (gettext("\tupgrade [-v]\n"
 		    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
 	case HELP_JAIL:
 		return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
 	case HELP_UNJAIL:
 		return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
 	case HELP_LIST:
 		return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
 		    "[-s property]...\n\t    [-S property]... [-t type[,...]] "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
 		    "\tmount [-vO] [-o opts] <-a | filesystem>\n"));
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
 		return (gettext("\treceive|recv [-vnsFu] <filesystem|volume|"
 		    "snapshot>\n"
 		    "\treceive|recv [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
 		    "<filesystem>\n"
 		    "\treceive|recv -A <filesystem|volume>\n"));
 	case HELP_RENAME:
 		/*
 		 * The last line must end in "\n" like every other message;
 		 * without it the next command's usage is glued onto it.
 		 */
 		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
 		    "<filesystem|volume|snapshot>\n"
 		    "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
 		    "\trename -r <snapshot> <snapshot>\n"
 		    "\trename -u [-p] <filesystem> <filesystem>\n"));
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
 		return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
 		    "<snapshot>\n"
 		    "\tsend [-Le] [-i snapshot|bookmark] "
 		    "<filesystem|volume|snapshot>\n"
 		    "\tsend [-nvPe] -t <receive_resume_token>\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> ... "
 		    "<filesystem|volume|snapshot> ...\n"));
 	case HELP_SHARE:
 		return (gettext("\tshare <-a | filesystem>\n"));
 	case HELP_SNAPSHOT:
 		return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
 		    "<filesystem|volume>@<snap> ...\n"));
 	case HELP_UNMOUNT:
 		return (gettext("\tunmount|umount [-f] "
 		    "<-a | filesystem|mountpoint>\n"));
 	case HELP_UNSHARE:
 		return (gettext("\tunshare "
 		    "<-a | filesystem|mountpoint>\n"));
 	case HELP_ALLOW:
 		return (gettext("\tallow <filesystem|volume>\n"
 		    "\tallow [-ldug] "
 		    "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
 		    "\t    <filesystem|volume>\n"
 		    "\tallow [-ld] -e <perm|@setname>[,...] "
 		    "<filesystem|volume>\n"
 		    "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
 		    "\tallow -s @setname <perm|@setname>[,...] "
 		    "<filesystem|volume>\n"));
 	case HELP_UNALLOW:
 		return (gettext("\tunallow [-rldug] "
 		    "<\"everyone\"|user|group>[,...]\n"
 		    "\t    [<perm|@setname>[,...]] <filesystem|volume>\n"
 		    "\tunallow [-rld] -e [<perm|@setname>[,...]] "
 		    "<filesystem|volume>\n"
 		    "\tunallow [-r] -c [<perm|@setname>[,...]] "
 		    "<filesystem|volume>\n"
 		    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
 		    "<filesystem|volume>\n"));
 	case HELP_USERSPACE:
 		return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
 		    "[-s field] ...\n"
 		    "\t    [-S field] ... [-t type[,...]] "
 		    "<filesystem|snapshot>\n"));
 	case HELP_GROUPSPACE:
 		return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
 		    "[-s field] ...\n"
 		    "\t    [-S field] ... [-t type[,...]] "
 		    "<filesystem|snapshot>\n"));
 	case HELP_HOLD:
 		return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
 	case HELP_HOLDS:
 		return (gettext("\tholds [-Hp] [-r|-d depth] "
 		    "<filesystem|volume|snapshot> ...\n"));
 	case HELP_RELEASE:
 		return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
 	case HELP_DIFF:
 		return (gettext("\tdiff [-FHt] <snapshot> "
 		    "[snapshot|filesystem]\n"));
 	case HELP_BOOKMARK:
 		return (gettext("\tbookmark <snapshot> <bookmark>\n"));
 	}
 
 	abort();
 	/* NOTREACHED */
 }
 
 /*
  * Report an out-of-memory condition on stderr and terminate the process
  * with exit status 1.  Does not return.
  */
 void
 nomem(void)
 {
 	const char *msg = gettext("internal error: out of memory\n");
 
 	(void) fprintf(stderr, msg);
 	exit(1);
 }
 
 /*
  * Allocate 'size' bytes of zero-filled memory.  The allocation is made
  * with calloc(); if it returns NULL, nomem() is called, which terminates
  * the program, so callers never see a NULL result.
  */
 
 void *
 safe_malloc(size_t size)
 {
 	void *mem = calloc(1, size);
 
 	if (mem == NULL)
 		nomem();
 
 	return (mem);
 }
 
 /*
  * Duplicate 'str' with strdup().  If the copy cannot be allocated,
  * nomem() is called, which terminates the program.
  */
 static char *
 safe_strdup(char *str)
 {
 	char *copy;
 
 	if ((copy = strdup(str)) == NULL)
 		nomem();
 
 	return (copy);
 }
 
 /*
  * Property iteration callback: write one row of the property table for
  * 'prop' to the FILE passed in 'cb'.  The row holds the property name,
  * whether it is editable, whether it is inheritable, and its accepted
  * values ("-" when none are listed).  Always returns ZPROP_CONT.
  */
 static int
 usage_prop_cb(int prop, void *cb)
 {
 	FILE *out = cb;
 	const char *values;
 
 	(void) fprintf(out, "\t%-15s ", zfs_prop_to_name(prop));
 
 	/* EDIT column: read-only properties cannot be edited. */
 	(void) fprintf(out, "%s",
 	    zfs_prop_readonly(prop) ? " NO    " : "YES    ");
 
 	/* INHERIT column. */
 	(void) fprintf(out, "%s",
 	    zfs_prop_inheritable(prop) ? "  YES   " : "   NO   ");
 
 	/* VALUES column. */
 	values = zfs_prop_values(prop);
 	(void) fprintf(out, "%s\n", (values == NULL) ? "-" : values);
 
 	return (ZPROP_CONT);
 }
 
 /*
  * Print a usage message and exit.
  *
  * When no sub-command is active (current_command is NULL) the usage of
  * every entry in command_table is printed; otherwise only the usage of the
  * current sub-command.  For "set", "get", "inherit" and "list" the table of
  * supported properties follows; in all other cases a hint on how to obtain
  * the property and permission lists is printed.
  *
  * 'requested' says whether the user asked for the message: if so it goes
  * to stdout and the exit status is 0, otherwise it goes to stderr and the
  * exit status is 2.  If ZFS_ABORT is set in the environment the process
  * aborts instead of exiting.
  */
 static void
 usage(boolean_t requested)
 {
 	FILE *out = requested ? stdout : stderr;
 	boolean_t list_props = B_FALSE;
 
 	if (current_command != NULL) {
 		const char *cmdname = current_command->name;
 
 		(void) fprintf(out, gettext("usage:\n"));
 		(void) fprintf(out, "%s", get_usage(current_command->usage));
 
 		/* Only the property-oriented commands get the table. */
 		if (strcmp(cmdname, "set") == 0 ||
 		    strcmp(cmdname, "get") == 0 ||
 		    strcmp(cmdname, "inherit") == 0 ||
 		    strcmp(cmdname, "list") == 0)
 			list_props = B_TRUE;
 	} else {
 		zfs_command_t *cmd;
 
 		(void) fprintf(out, gettext("usage: zfs command args ...\n"));
 		(void) fprintf(out,
 		    gettext("where 'command' is one of the following:\n\n"));
 
 		/* A NULL name is a group separator: emit a blank line. */
 		for (cmd = command_table; cmd != &command_table[NCOMMAND];
 		    cmd++) {
 			if (cmd->name != NULL)
 				(void) fprintf(out, "%s",
 				    get_usage(cmd->usage));
 			else
 				(void) fprintf(out, "\n");
 		}
 
 		(void) fprintf(out, gettext("\nEach dataset is of the form: "
 		    "pool/[dataset/]*dataset[@name]\n"));
 	}
 
 	if (!list_props) {
 		(void) fprintf(out,
 		    gettext("\nFor the property list, run: %s\n"),
 		    "zfs set|get");
 		(void) fprintf(out,
 		    gettext("\nFor the delegated permission list, run: %s\n"),
 		    "zfs allow|unallow");
 	} else {
 		(void) fprintf(out,
 		    gettext("\nThe following properties are supported:\n"));
 
 		(void) fprintf(out, "\n\t%-14s %s  %s   %s\n\n",
 		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
 
 		/* Iterate over all properties */
 		(void) zprop_iter(usage_prop_cb, out, B_FALSE, B_TRUE,
 		    ZFS_TYPE_DATASET);
 
 		/* Rows printed by hand, after the iterated properties. */
 		(void) fprintf(out, "\t%-15s ", "userused@...");
 		(void) fprintf(out, " NO       NO   <size>\n");
 		(void) fprintf(out, "\t%-15s ", "groupused@...");
 		(void) fprintf(out, " NO       NO   <size>\n");
 		(void) fprintf(out, "\t%-15s ", "userquota@...");
 		(void) fprintf(out, "YES       NO   <size> | none\n");
 		(void) fprintf(out, "\t%-15s ", "groupquota@...");
 		(void) fprintf(out, "YES       NO   <size> | none\n");
 		(void) fprintf(out, "\t%-15s ", "written@<snap>");
 		(void) fprintf(out, " NO       NO   <size>\n");
 
 		(void) fprintf(out, gettext("\nSizes are specified in bytes "
 		    "with standard units such as K, M, G, etc.\n"));
 		(void) fprintf(out, gettext("\nUser-defined properties can "
 		    "be specified by using a name containing a colon (:).\n"));
 		(void) fprintf(out, gettext("\nThe {user|group}{used|quota}@ "
 		    "properties must be appended with\n"
 		    "a user or group specifier of one of these forms:\n"
 		    "    POSIX name      (eg: \"matt\")\n"
 		    "    POSIX id        (eg: \"126829\")\n"
 		    "    SMB name@domain (eg: \"matt@sun\")\n"
 		    "    SMB SID         (eg: \"S-1-234-567-89\")\n"));
 	}
 
 	/*
 	 * See comments at end of main().
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	exit(requested ? 0 : 2);
 }
 
 /*
  * Split a "property=value" argument at its first '=' and add the pair to
  * 'props' as a string entry.  The argument is modified in place: the '='
  * is overwritten with a NUL so that 'propname' becomes the bare name.
  *
  * Returns 0 on success, or -1 after printing a message to stderr when the
  * '=' is missing or the property is already present in 'props' as a
  * string.  A failure to add the entry is handed to nomem().
  */
 static int
 parseprop(nvlist_t *props, char *propname)
 {
 	char *eq = strchr(propname, '=');
 	char *existing;
 
 	if (eq == NULL) {
 		(void) fprintf(stderr, gettext("missing "
 		    "'=' for property=value argument\n"));
 		return (-1);
 	}
 
 	/* Terminate the name; the value starts right after the '='. */
 	*eq = '\0';
 
 	if (nvlist_lookup_string(props, propname, &existing) == 0) {
 		(void) fprintf(stderr, gettext("property '%s' "
 		    "specified multiple times\n"), propname);
 		return (-1);
 	}
 
 	if (nvlist_add_string(props, propname, eq + 1) != 0)
 		nomem();
 
 	return (0);
 }
 
 /*
  * Parse the argument of a depth-limit option.
  *
  * 'opt' must be a complete, non-negative integer in any base strtol()
  * accepts with base 0.  On success the depth is returned and
  * ZFS_ITER_DEPTH_LIMIT and ZFS_ITER_RECURSE are set in '*flags'.  On a
  * malformed or negative value a message is printed and usage(B_FALSE) is
  * called, which exits.
  */
 static int
 parse_depth(char *opt, int *flags)
 {
 	char *tmp;
 	int depth;
 
 	depth = (int)strtol(opt, &tmp, 0);
 	/*
 	 * tmp == opt means strtol() consumed no digits at all (for example
 	 * an empty argument); checking only *tmp would silently accept such
 	 * input as a depth of 0.
 	 */
 	if (tmp == opt || *tmp != '\0') {
 		(void) fprintf(stderr,
 		    gettext("%s is not an integer\n"), opt);
 		usage(B_FALSE);
 	}
 	if (depth < 0) {
 		(void) fprintf(stderr,
 		    gettext("Depth can not be negative.\n"));
 		usage(B_FALSE);
 	}
 	*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
 	return (depth);
 }
 
 /* Delay added to the start time before progress output may first appear. */
 #define	PROGRESS_DELAY 2		/* seconds */
 
 /* Backspaces printed after each update to move the cursor back over it. */
 static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
 /* Time after which update_progress() starts to print. */
 static time_t pt_begin;
 /* Heap copy of the current header; owned by set/finish_progress(). */
 static char *pt_header = NULL;
 /* Whether any progress output has been printed since the timer started. */
 static boolean_t pt_shown;
 
 /*
  * Arm the progress timer: mark progress as not yet shown and record the
  * time (now plus PROGRESS_DELAY seconds) after which update_progress()
  * begins to print.
  */
 static void
 start_progress_timer(void)
 {
 	pt_shown = B_FALSE;
 	pt_begin = time(NULL) + PROGRESS_DELAY;
 }
 
 /*
  * Install 'header' as the label for subsequent progress updates.  A
  * private copy is kept in pt_header; no header may be installed already
  * (asserted).  If progress output is already visible the new header is
  * printed right away.
  */
 static void
 set_progress_header(char *header)
 {
 	assert(pt_header == NULL);
 	pt_header = safe_strdup(header);
 
 	if (!pt_shown)
 		return;
 
 	(void) printf("%s: ", header);
 	(void) fflush(stdout);
 }
 
 /*
  * Print the progress text 'update', followed by characters taken from
  * pt_reverse (backspaces), so that the next update overwrites this one.
  *
  * Nothing is printed until the time recorded in pt_begin has passed.  The
  * first update printed is prefixed with the header and marks progress as
  * shown; later updates print only the text.
  */
 static void
 update_progress(char *update)
 {
 	int len;
 
 	if (!pt_shown) {
 		/* Still within the initial delay: stay silent. */
 		if (time(NULL) <= pt_begin)
 			return;
 
 		len = strlen(update);
 		(void) printf("%s: %s%*.*s", pt_header, update, len, len,
 		    pt_reverse);
 		(void) fflush(stdout);
 		pt_shown = B_TRUE;
 		return;
 	}
 
 	len = strlen(update);
 	(void) printf("%s%*.*s", update, len, len, pt_reverse);
 	(void) fflush(stdout);
 }
 
 static void
 finish_progress(char *done)
 {
 	if (pt_shown) {
 		(void) printf("%s\n", done);
 		(void) fflush(stdout);
 	}
 	free(pt_header);
 	pt_header = NULL;
 }
 
 /*
  * Tell whether a dataset should be mounted automatically: the "canmount"
  * property must be valid for the dataset's type and must be set to "on".
  */
 static boolean_t
 should_auto_mount(zfs_handle_t *zhp)
 {
 	if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp))) {
 		return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
 		    ZFS_CANMOUNT_ON);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
  *
  * Given an existing dataset, create a writable copy whose initial contents
  * are the same as the source.  The newly created dataset maintains a
  * dependency on the original; the original cannot be destroyed so long as
  * the clone exists.
  *
  * The '-p' flag creates all the non-existing ancestors of the target first.
  *
  * Returns 0 on success and 1 on failure.  A clone that was created but
  * could not be mounted or shared also counts as a failure.  Usage errors
  * end in usage(B_FALSE), which exits.  Every return path releases the
  * property list and the source snapshot handle.
  */
 static int
 zfs_do_clone(int argc, char **argv)
 {
 	zfs_handle_t *zhp = NULL;
 	boolean_t parents = B_FALSE;
 	nvlist_t *props;
 	int ret = 1;
 	int c;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, "o:p")) != -1) {
 		switch (c) {
 		case 'o':
 			if (parseprop(props, optarg) != 0)
 				goto out;
 			break;
 		case 'p':
 			parents = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			goto usage;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		goto usage;
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		goto usage;
 	}
 	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		goto usage;
 	}
 
 	/* open the source dataset */
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
 		goto out;
 
 	if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME)) {
 		/*
 		 * Now create the ancestors of the target dataset.  If the
 		 * target already exists and '-p' option was used we should not
 		 * complain.
 		 */
 		if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
 		    ZFS_TYPE_VOLUME)) {
 			ret = 0;
 			goto out;
 		}
 		if (zfs_create_ancestors(g_zfs, argv[1]) != 0)
 			goto out;
 	}
 
 	/* pass to libzfs */
 	ret = zfs_clone(zhp, argv[1], props);
 
 	/* create the mountpoint if necessary */
 	if (ret == 0) {
 		zfs_handle_t *clone;
 
 		clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
 		if (clone != NULL) {
 			/*
 			 * If the user doesn't want the dataset
 			 * automatically mounted, then skip the mount/share
 			 * step.
 			 */
 			if (should_auto_mount(clone)) {
 				if ((ret = zfs_mount(clone, NULL, 0)) != 0) {
 					(void) fprintf(stderr, gettext("clone "
 					    "successfully created, "
 					    "but not mounted\n"));
 				} else if ((ret = zfs_share(clone)) != 0) {
 					(void) fprintf(stderr, gettext("clone "
 					    "successfully created, "
 					    "but not shared\n"));
 				}
 			}
 			zfs_close(clone);
 		}
 	}
 
 out:
 	/* Single exit point, so no path leaks the handle or the nvlist. */
 	if (zhp != NULL)
 		zfs_close(zhp);
 	nvlist_free(props);
 
 	return (!!ret);
 
 usage:
 	if (zhp)
 		zfs_close(zhp);
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (-1);
 }
 
 /*
  * zfs create [-pu] [-o prop=value] ... fs
  * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
  *
  * Create a new dataset.  This command can be used to create filesystems
  * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
  * For volumes, the user must specify a size to be used.
  *
  * The '-s' flag applies only to volumes, and indicates that we should not try
  * to set the reservation for this volume.  By default we set a reservation
  * equal to the size for any volume.  For pools with SPA_VERSION >=
  * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
  *
  * The '-p' flag creates all the non-existing ancestors of the target first.
  *
  * The '-u' flag prevents mounting of newly created file system.
  */
 static int
 zfs_do_create(int argc, char **argv)
 {
 	/*
 	 * Handler for the 'create' subcommand.  Parses the options, collects
 	 * the requested properties in 'props', creates the dataset through
 	 * libzfs and then (unless '-u' was given) tries to mount and share it.
 	 *
 	 * Returns 0 on success, 1 on failure (including "created but not
 	 * mounted/shared"), and 2 after a usage error.
 	 */
 	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
 	zfs_handle_t *zhp = NULL;
 	/* Only assigned by '-V'; only read when type is ZFS_TYPE_VOLUME. */
 	uint64_t volsize;
 	int c;
 	boolean_t noreserve = B_FALSE;
 	boolean_t bflag = B_FALSE;
 	boolean_t parents = B_FALSE;
 	boolean_t nomount = B_FALSE;
 	int ret = 1;
 	nvlist_t *props;
 	uint64_t intval;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) {
 		switch (c) {
 		case 'V':
 			/* A volume size turns this into a volume creation. */
 			type = ZFS_TYPE_VOLUME;
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
 				nomem();
 			volsize = intval;
 			break;
 		case 'p':
 			parents = B_TRUE;
 			break;
 		case 'b':
 			bflag = B_TRUE;
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "block size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 			    intval) != 0)
 				nomem();
 			break;
 		case 'o':
 			if (parseprop(props, optarg) != 0)
 				goto error;
 			break;
 		case 's':
 			noreserve = B_TRUE;
 			break;
 		case 'u':
 			nomount = B_TRUE;
 			break;
 		case ':':
 			/*
 			 * The leading ':' in the option string makes getopt()
 			 * report a missing option argument as ':'.  Note that
 			 * the same message is printed whichever option lacked
 			 * its argument.
 			 */
 			(void) fprintf(stderr, gettext("missing size "
 			    "argument\n"));
 			goto badusage;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			goto badusage;
 		}
 	}
 
 	/* Reject option combinations that do not match the dataset type. */
 	if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
 		(void) fprintf(stderr, gettext("'-s' and '-b' can only be "
 		    "used when creating a volume\n"));
 		goto badusage;
 	}
 	if (nomount && type != ZFS_TYPE_FILESYSTEM) {
 		(void) fprintf(stderr, gettext("'-u' can only be "
 		    "used when creating a file system\n"));
 		goto badusage;
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing %s argument\n"),
 		    zfs_type_to_name(type));
 		goto badusage;
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		goto badusage;
 	}
 
 	/*
 	 * For a volume created without '-s', add a reservation property
 	 * derived from the volume size unless the caller already supplied one.
 	 */
 	if (type == ZFS_TYPE_VOLUME && !noreserve) {
 		zpool_handle_t *zpool_handle;
 		nvlist_t *real_props;
 		uint64_t spa_version;
 		char *p;
 		zfs_prop_t resv_prop;
 		char *strval;
 		char msg[1024];
 
 		/*
 		 * Temporarily cut the name at the first '/' so that only the
 		 * leading component is passed to zpool_open(), then restore
 		 * the full name.
 		 */
 		if (p = strchr(argv[0], '/'))
 			*p = '\0';
 		zpool_handle = zpool_open(g_zfs, argv[0]);
 		if (p != NULL)
 			*p = '/';
 		if (zpool_handle == NULL)
 			goto error;
 		/* Which reservation property is used depends on pool version. */
 		spa_version = zpool_get_prop_int(zpool_handle,
 		    ZPOOL_PROP_VERSION, NULL);
 		if (spa_version >= SPA_VERSION_REFRESERVATION)
 			resv_prop = ZFS_PROP_REFRESERVATION;
 		else
 			resv_prop = ZFS_PROP_RESERVATION;
 
 		(void) snprintf(msg, sizeof (msg),
 		    gettext("cannot create '%s'"), argv[0]);
 		/* 'props' is never NULL here; nvlist_alloc() succeeded above. */
 		if (props && (real_props = zfs_valid_proplist(g_zfs, type,
 		    props, 0, NULL, zpool_handle, msg)) == NULL) {
 			zpool_close(zpool_handle);
 			goto error;
 		}
 		zpool_close(zpool_handle);
 
 		/* From here on 'volsize' holds the reservation, not the size. */
 		volsize = zvol_volsize_to_reservation(volsize, real_props);
 		nvlist_free(real_props);
 
 		/*
 		 * Only add the computed reservation when 'props' has no string
 		 * entry for it (presumably one given with '-o' -- confirm
 		 * against parseprop()).
 		 */
 		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
 		    &strval) != 0) {
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(resv_prop), volsize) != 0) {
 				nvlist_free(props);
 				nomem();
 			}
 		}
 	}
 
 	if (parents && zfs_name_valid(argv[0], type)) {
 		/*
 		 * Now create the ancestors of target dataset.  If the target
 		 * already exists and '-p' option was used we should not
 		 * complain.
 		 */
 		if (zfs_dataset_exists(g_zfs, argv[0], type)) {
 			/* "error" is also the common cleanup path; ret is 0. */
 			ret = 0;
 			goto error;
 		}
 		if (zfs_create_ancestors(g_zfs, argv[0]) != 0)
 			goto error;
 	}
 
 	/* pass to libzfs */
 	if (zfs_create(g_zfs, argv[0], type, props) != 0)
 		goto error;
 
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
 		goto error;
 
 	/* The dataset exists now; only mount/share can still fail. */
 	ret = 0;
 
 	/*
 	 * Mount and/or share the new filesystem as appropriate.  We provide a
 	 * verbose error message to let the user know that their filesystem was
 	 * in fact created, even if we failed to mount or share it.
 	 * If the user doesn't want the dataset automatically mounted,
 	 * then skip the mount/share step altogether.
 	 */
 	if (!nomount && should_auto_mount(zhp)) {
 		if (zfs_mount(zhp, NULL, 0) != 0) {
 			(void) fprintf(stderr, gettext("filesystem "
 			    "successfully created, but not mounted\n"));
 			ret = 1;
 		} else if (zfs_share(zhp) != 0) {
 			(void) fprintf(stderr, gettext("filesystem "
 			    "successfully created, but not shared\n"));
 			ret = 1;
 		}
 	}
 
 	/* Common exit: reached on success as well as on failure. */
 error:
 	if (zhp)
 		zfs_close(zhp);
 	nvlist_free(props);
 	return (ret);
 	/* Usage errors: release the property list and print the usage text. */
 badusage:
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (2);
 }
 
 /*
  * zfs destroy [-rRf] <fs, vol>
  * zfs destroy [-rRd] <snap>
  *
  *	-r	Recursively destroy all children
  *	-R	Recursively destroy all dependents, including clones
  *	-f	Force unmounting of any dependents
  *	-d	If we can't destroy now, mark for deferred destruction
  *
  * Destroys the given dataset.  By default, it will unmount any filesystems,
  * and refuse to destroy a dataset that has any dependents.  A dependent can
  * either be a child, or a clone of a child.
  */
 /* State shared between 'zfs destroy' and its iteration callbacks. */
 typedef struct destroy_cbdata {
 	/* true until the first "cannot destroy" header has been printed */
 	boolean_t	cb_first;
 	/* '-f': unmount with MS_FORCE */
 	boolean_t	cb_force;
 	/* '-r' or '-R' */
 	boolean_t	cb_recurse;
 	/* set by the dependent check when something blocks the destroy */
 	boolean_t	cb_error;
 	/* '-R': also destroy clones */
 	boolean_t	cb_doclones;
 	/* dataset whose dependents are currently being checked */
 	zfs_handle_t	*cb_target;
 	/* '-d': passed through as the defer flag to the destroy calls */
 	boolean_t	cb_defer_destroy;
 	/* '-v' or '-p': print what is (or would be) destroyed */
 	boolean_t	cb_verbose;
 	/* '-p': machine-parsable output */
 	boolean_t	cb_parsable;
 	/* '-n': do not actually destroy anything */
 	boolean_t	cb_dryrun;
 	/* names of the snapshots selected by a snapshot spec */
 	nvlist_t	*cb_nvl;
 	/* snapshots collected so they can be destroyed in one call */
 	nvlist_t	*cb_batchedsnaps;
 
 	/* first snap in contiguous run */
 	char		*cb_firstsnap;
 	/* previous snap in contiguous run */
 	char		*cb_prevsnap;
 	/* running total of space reported by lzc_snaprange_space() */
 	int64_t		cb_snapused;
 	/* text after '@' in the command-line argument */
 	char		*cb_snapspec;
 	/* NOTE(review): not referenced in the code visible here -- confirm */
 	char		*cb_bookmark;
 } destroy_cbdata_t;
 
 /*
  * Dependent-iteration callback run before a destroy.  Lists on stderr every
  * dependent of cb_target that the '-r' / '-R' flags given do not cover, and
  * records the problem in cb_error.  The handle is always closed and the
  * return value is always 0, so the iteration visits every dependent.
  */
 static int
 destroy_check_dependent(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cbp = data;
 	const char *tname = zfs_get_name(cbp->cb_target);
 	const char *name = zfs_get_name(zhp);
 	size_t tlen = strlen(tname);
 	boolean_t descendant;
 
 	/*
 	 * A name that extends the target's name with '/' or '@' lives below
 	 * the target; anything else is a clone elsewhere in the hierarchy.
 	 */
 	descendant = (strncmp(tname, name, tlen) == 0 &&
 	    (name[tlen] == '/' || name[tlen] == '@'));
 
 	if (descendant) {
 		/* Descendants are only a problem when '-r' is missing. */
 		if (!cbp->cb_recurse) {
 			if (cbp->cb_first) {
 				(void) fprintf(stderr,
 				    gettext("cannot destroy '%s': "
 				    "%s has children\n"),
 				    tname, zfs_type_to_name(
 				    zfs_get_type(cbp->cb_target)));
 				(void) fprintf(stderr,
 				    gettext("use '-r' to destroy "
 				    "the following datasets:\n"));
 				cbp->cb_first = B_FALSE;
 				cbp->cb_error = B_TRUE;
 			}
 
 			(void) fprintf(stderr, "%s\n", name);
 		}
 	} else if (cbp->cb_recurse ||
 	    zfs_get_type(cbp->cb_target) == ZFS_TYPE_SNAPSHOT) {
 		/*
 		 * A clone is reported when '-r' was specified or when the
 		 * target itself is a snapshot.
 		 */
 		if (cbp->cb_first) {
 			(void) fprintf(stderr,
 			    gettext("cannot destroy '%s': "
 			    "%s has dependent clones\n"),
 			    tname, zfs_type_to_name(
 			    zfs_get_type(cbp->cb_target)));
 			(void) fprintf(stderr,
 			    gettext("use '-R' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
 			cbp->cb_error = B_TRUE;
 			cbp->cb_dryrun = B_TRUE;
 		}
 
 		(void) fprintf(stderr, "%s\n", name);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Destroy one dataset (or, for a snapshot, queue it in cb_batchedsnaps).
  * Prints a progress line in verbose mode.  The handle is always closed.
  * Returns 0 on success or when nothing had to be done, -1 on failure.
  */
 static int
 destroy_callback(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cb = data;
 	const char *name = zfs_get_name(zhp);
 	boolean_t is_pool;
 	int rv = 0;
 
 	if (cb->cb_verbose) {
 		if (cb->cb_parsable)
 			(void) printf("destroy\t%s\n", name);
 		else if (cb->cb_dryrun)
 			(void) printf(gettext("would destroy %s\n"), name);
 		else
 			(void) printf(gettext("will destroy %s\n"), name);
 	}
 
 	/*
 	 * Pools are skipped (they were already flagged as an error before
 	 * getting here), and a dry run stops after the message above.
 	 */
 	is_pool = (strchr(zfs_get_name(zhp), '/') == NULL &&
 	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM);
 
 	if (!is_pool && !cb->cb_dryrun) {
 		/*
 		 * Contiguous snapshots (even of different filesystems) are
 		 * batched and destroyed with one call.  The batch has to be
 		 * flushed before each non-snapshot because a clone must be
 		 * deleted before its origin.
 		 */
 		if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
 			fnvlist_add_boolean(cb->cb_batchedsnaps, name);
 		} else {
 			int flush_err = zfs_destroy_snaps_nvl(g_zfs,
 			    cb->cb_batchedsnaps, B_FALSE);
 
 			fnvlist_free(cb->cb_batchedsnaps);
 			cb->cb_batchedsnaps = fnvlist_alloc();
 
 			if (flush_err != 0 ||
 			    zfs_unmount(zhp, NULL,
 			    cb->cb_force ? MS_FORCE : 0) != 0 ||
 			    zfs_destroy(zhp, cb->cb_defer_destroy) != 0)
 				rv = -1;
 		}
 	}
 
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Snapshot-iteration callback used for verbose output.  Snapshots that are
  * present in cb_nvl extend the current run (cb_firstsnap .. cb_prevsnap);
  * the first snapshot that is not in cb_nvl ends the run, at which point the
  * space of that range is added to cb_snapused.  Closes the handle and
  * returns the result of lzc_snaprange_space(), or 0 if it was not called.
  */
 static int
 destroy_print_cb(zfs_handle_t *zhp, void *arg)
 {
 	destroy_cbdata_t *cb = arg;
 	const char *name = zfs_get_name(zhp);
 	int err = 0;
 
 	if (!nvlist_exists(cb->cb_nvl, name)) {
 		if (cb->cb_firstsnap != NULL) {
 			/* end of this range */
 			uint64_t used = 0;
 
 			err = lzc_snaprange_space(cb->cb_firstsnap,
 			    cb->cb_prevsnap, &used);
 			cb->cb_snapused += used;
 			free(cb->cb_firstsnap);
 			free(cb->cb_prevsnap);
 			cb->cb_firstsnap = NULL;
 			cb->cb_prevsnap = NULL;
 		}
 		zfs_close(zhp);
 		return (err);
 	}
 
 	/* this snap starts or continues the current range */
 	if (cb->cb_firstsnap == NULL)
 		cb->cb_firstsnap = strdup(name);
 	free(cb->cb_prevsnap);
 	cb->cb_prevsnap = strdup(name);
 	if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
 		nomem();
 
 	if (cb->cb_verbose) {
 		if (cb->cb_parsable)
 			(void) printf("destroy\t%s\n", name);
 		else if (cb->cb_dryrun)
 			(void) printf(gettext("would destroy %s\n"), name);
 		else
 			(void) printf(gettext("will destroy %s\n"), name);
 	}
 
 	zfs_close(zhp);
 	return (err);
 }
 
 /*
  * Walk the snapshots of 'fs_zhp' in sorted order with destroy_print_cb()
  * and account for a run that is still open when the walk ends.  Returns the
  * first error encountered, or 0.
  */
 static int
 destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
 {
 	uint64_t used = 0;
 	int err;
 
 	assert(cb->cb_firstsnap == NULL);
 	assert(cb->cb_prevsnap == NULL);
 
 	err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb);
 
 	/* No run left open: nothing more to add up or release. */
 	if (cb->cb_firstsnap == NULL)
 		return (err);
 
 	/* Only query the space if the walk itself did not fail. */
 	if (err == 0) {
 		err = lzc_snaprange_space(cb->cb_firstsnap,
 		    cb->cb_prevsnap, &used);
 	}
 	cb->cb_snapused += used;
 
 	free(cb->cb_firstsnap);
 	free(cb->cb_prevsnap);
 	cb->cb_firstsnap = NULL;
 	cb->cb_prevsnap = NULL;
 
 	return (err);
 }
 
 /*
  * Snapshot-spec callback: record the snapshot's name in cb_nvl.  Unless
  * clones are being destroyed too ('-R') or the destroy is deferred ('-d'),
  * the snapshot's dependents are checked first.  Closes the handle and
  * returns the result of that check (0 when it was skipped).
  */
 static int
 snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
 {
 	destroy_cbdata_t *cb = arg;
 	boolean_t check_clones = !cb->cb_doclones && !cb->cb_defer_destroy;
 	int err = 0;
 
 	if (check_clones) {
 		cb->cb_target = zhp;
 		cb->cb_first = B_TRUE;
 		err = zfs_iter_dependents(zhp, B_TRUE,
 		    destroy_check_dependent, cb);
 	}
 
 	if (err == 0 && nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)) != 0)
 		nomem();
 
 	zfs_close(zhp);
 	return (err);
 }
 
 /*
  * Collect the snapshots of 'zhp' that match cb_snapspec into cb_nvl, print
  * them in verbose mode, and with cb_recurse repeat for every child
  * filesystem.  A spec that matches nothing here (ENOENT) is not an error.
  * Closes 'zhp' and returns 0 or the first error.
  */
 static int
 gather_snapshots(zfs_handle_t *zhp, void *arg)
 {
 	destroy_cbdata_t *cb = arg;
 	int err;
 
 	err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
 	if (err == ENOENT)
 		err = 0;
 
 	/* Each later step runs only while everything before it succeeded. */
 	if (err == 0 && cb->cb_verbose)
 		err = destroy_print_snapshots(zhp, cb);
 
 	if (err == 0 && cb->cb_recurse)
 		err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
 
 	zfs_close(zhp);
 	return (err);
 }
 
 /*
  * Run destroy_callback() over the dependents of every snapshot named in
  * cb_nvl.  Snapshots that cannot be opened are skipped.  Returns 0, or the
  * first non-zero result of the dependent iteration.
  */
 static int
 destroy_clones(destroy_cbdata_t *cb)
 {
 	nvpair_t *pair = NULL;
 
 	while ((pair = nvlist_next_nvpair(cb->cb_nvl, pair)) != NULL) {
 		zfs_handle_t *zhp;
 		boolean_t saved_defer;
 		int err;
 
 		zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT);
 		if (zhp == NULL)
 			continue;
 
 		/*
 		 * We can't defer destroy non-snapshots, so the flag is
 		 * cleared while the clones are destroyed and restored
 		 * afterwards.
 		 */
 		saved_defer = cb->cb_defer_destroy;
 		cb->cb_defer_destroy = B_FALSE;
 		err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb);
 		cb->cb_defer_destroy = saved_defer;
 
 		zfs_close(zhp);
 		if (err != 0)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Handler for the 'destroy' subcommand.  The single name argument selects
  * one of three paths:
  *
  *	name contains '@'	destroy the snapshots matching the spec after
  *				the '@' (optionally with their clones)
  *	name contains '#'	destroy a bookmark
  *	otherwise		destroy a filesystem or volume together with
  *				its dependents
  *
  * Returns 0 on success and non-zero on failure.
  */
 static int
 zfs_do_destroy(int argc, char **argv)
 {
 	destroy_cbdata_t cb = { 0 };
 	int rv = 0;
 	int err = 0;
 	int c;
 	zfs_handle_t *zhp = NULL;
 	char *at, *pound;
 	zfs_type_t type = ZFS_TYPE_DATASET;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
 		switch (c) {
 		case 'v':
 			cb.cb_verbose = B_TRUE;
 			break;
 		case 'p':
 			cb.cb_verbose = B_TRUE;
 			cb.cb_parsable = B_TRUE;
 			break;
 		case 'n':
 			cb.cb_dryrun = B_TRUE;
 			break;
 		case 'd':
 			cb.cb_defer_destroy = B_TRUE;
 			type = ZFS_TYPE_SNAPSHOT;
 			break;
 		case 'f':
 			cb.cb_force = B_TRUE;
 			break;
 		case 'r':
 			cb.cb_recurse = B_TRUE;
 			break;
 		case 'R':
 			cb.cb_recurse = B_TRUE;
 			cb.cb_doclones = B_TRUE;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	at = strchr(argv[0], '@');
 	pound = strchr(argv[0], '#');
 	if (at != NULL) {
 
 		/* Build the list of snaps to destroy in cb_nvl. */
 		cb.cb_nvl = fnvlist_alloc();
 
 		/* Split "dataset@spec" in place; argv[0] is now the dataset. */
 		*at = '\0';
 		zhp = zfs_open(g_zfs, argv[0],
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (zhp == NULL) {
 			/* Leave through 'out' so that cb_nvl is released. */
 			rv = 1;
 			goto out;
 		}
 
 		cb.cb_snapspec = at + 1;
 		/* gather_snapshots() closes the handle it is given. */
 		if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
 		    cb.cb_error) {
 			rv = 1;
 			goto out;
 		}
 
 		if (nvlist_empty(cb.cb_nvl)) {
 			(void) fprintf(stderr, gettext("could not find any "
 			    "snapshots to destroy; check snapshot names.\n"));
 			rv = 1;
 			goto out;
 		}
 
 		if (cb.cb_verbose) {
 			char buf[16];
 			zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
 			if (cb.cb_parsable) {
 				/* Cast so the argument matches "%llu". */
 				(void) printf("reclaim\t%llu\n",
 				    (unsigned long long)cb.cb_snapused);
 			} else if (cb.cb_dryrun) {
 				(void) printf(gettext("would reclaim %s\n"),
 				    buf);
 			} else {
 				(void) printf(gettext("will reclaim %s\n"),
 				    buf);
 			}
 		}
 
 		if (!cb.cb_dryrun) {
 			if (cb.cb_doclones) {
 				/*
 				 * Destroy the clones first, then whatever
 				 * snapshots the callback left batched.
 				 */
 				cb.cb_batchedsnaps = fnvlist_alloc();
 				err = destroy_clones(&cb);
 				if (err == 0) {
 					err = zfs_destroy_snaps_nvl(g_zfs,
 					    cb.cb_batchedsnaps, B_FALSE);
 				}
 				if (err != 0) {
 					rv = 1;
 					goto out;
 				}
 			}
 			if (err == 0) {
 				err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
 				    cb.cb_defer_destroy);
 			}
 		}
 
 		if (err != 0)
 			rv = 1;
 	} else if (pound != NULL) {
 		nvlist_t *nvl;
 
 		/*
 		 * Nothing has been allocated or opened yet on this path, so
 		 * the early returns below do not leak.
 		 */
 		if (cb.cb_dryrun) {
 			(void) fprintf(stderr,
 			    "dryrun is not supported with bookmark\n");
 			return (-1);
 		}
 
 		if (cb.cb_defer_destroy) {
 			(void) fprintf(stderr,
 			    "defer destroy is not supported with bookmark\n");
 			return (-1);
 		}
 
 		if (cb.cb_recurse) {
 			(void) fprintf(stderr,
 			    "recursive is not supported with bookmark\n");
 			return (-1);
 		}
 
 		if (!zfs_bookmark_exists(argv[0])) {
 			(void) fprintf(stderr, gettext("bookmark '%s' "
 			    "does not exist.\n"), argv[0]);
 			return (1);
 		}
 
 		nvl = fnvlist_alloc();
 		fnvlist_add_boolean(nvl, argv[0]);
 
 		err = lzc_destroy_bookmarks(nvl, NULL);
 		if (err != 0) {
 			(void) zfs_standard_error(g_zfs, err,
 			    "cannot destroy bookmark");
 		}
 
 		/* Release the list built above (cb.cb_nvl is unused here). */
 		fnvlist_free(nvl);
 
 		return (err);
 	} else {
 		/* Open the given dataset */
 		if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
 			return (1);
 
 		cb.cb_target = zhp;
 
 		/*
 		 * Perform an explicit check for pools before going any further.
 		 */
 		if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
 		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 			(void) fprintf(stderr, gettext("cannot destroy '%s': "
 			    "operation does not apply to pools\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use 'zfs destroy -r "
 			    "%s' to destroy all datasets in the pool\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
 			    "to destroy the pool itself\n"), zfs_get_name(zhp));
 			rv = 1;
 			goto out;
 		}
 
 		/*
 		 * Check for any dependents and/or clones.
 		 */
 		cb.cb_first = B_TRUE;
 		if (!cb.cb_doclones &&
 		    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
 		    &cb) != 0) {
 			rv = 1;
 			goto out;
 		}
 
 		if (cb.cb_error) {
 			rv = 1;
 			goto out;
 		}
 
 		cb.cb_batchedsnaps = fnvlist_alloc();
 		if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
 		    &cb) != 0) {
 			rv = 1;
 			goto out;
 		}
 
 		/*
 		 * Do the real thing.  The callback will close the
 		 * handle regardless of whether it succeeds or not.
 		 */
 		err = destroy_callback(zhp, &cb);
 		zhp = NULL;
 		if (err == 0) {
 			err = zfs_destroy_snaps_nvl(g_zfs,
 			    cb.cb_batchedsnaps, cb.cb_defer_destroy);
 		}
 		if (err != 0)
 			rv = 1;
 	}
 
 	/* Common cleanup; either list may still be NULL at this point. */
 out:
 	fnvlist_free(cb.cb_batchedsnaps);
 	fnvlist_free(cb.cb_nvl);
 	if (zhp != NULL)
 		zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Report whether the "received" column is among the columns selected for
  * output.  The column array is scanned up to ZFS_GET_NCOLS entries or the
  * first GET_COL_NONE, whichever comes first.
  */
 static boolean_t
 is_recvd_column(zprop_get_cbdata_t *cbp)
 {
 	int idx;
 
 	for (idx = 0; idx < ZFS_GET_NCOLS; idx++) {
 		zfs_get_column_t col = cbp->cb_columns[idx];
 
 		if (col == GET_COL_NONE)
 			break;
 		if (col == GET_COL_RECVD)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * zfs get [-rHp] [-d max] [-o all | field[,field]...]
  *	[-t type[,type]...] [-s source[,source]...]
  *	< all | property[,property]... > < fs | snap | vol > ...
  *
  *	-r	recurse over any child datasets
  *	-d	limit the recursion to the given depth
  *	-H	scripted mode.  Headers are stripped, and fields are separated
  *		by tabs instead of spaces.
  *	-o	Set of fields to display.  One of "name,property,value,
  *		received,source". Default is "name,property,value,source".
  *		"all" is an alias for all five.
  *	-s	Set of sources to allow.  One of
  *		"local,default,inherited,received,temporary,none".  Default is
  *		all six.
  *	-t	Set of dataset types to display.  One of
  *		"filesystem,volume,snapshot,bookmark,all".
  *	-p	Display values in parsable (literal) format.
  *
  *  Prints properties for the given datasets.  The user can control which
  *  columns to display as well as which property types to allow.
  */
 
 /*
  * Invoked to display the properties for a single dataset.
  *
  * Walks the property list in cbp->cb_proplist and prints one line per
  * property through zprop_print_one_property().  Four kinds of entries are
  * handled: native properties, userquota-style properties, "written"
  * properties and plain user properties.  Always returns 0.
  */
 static int
 get_callback(zfs_handle_t *zhp, void *data)
 {
 	char buf[ZFS_MAXPROPLEN];
 	char rbuf[ZFS_MAXPROPLEN];
 	zprop_source_t sourcetype;
 	/* Not filled in on every path; see the individual branches below. */
 	char source[ZFS_MAXNAMELEN];
 	zprop_get_cbdata_t *cbp = data;
 	nvlist_t *user_props = zfs_get_user_props(zhp);
 	zprop_list_t *pl = cbp->cb_proplist;
 	nvlist_t *propval;
 	char *strval;
 	char *sourceval;
 	/* Received values are only looked up if that column is displayed. */
 	boolean_t received = is_recvd_column(cbp);
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		char *recvdval = NULL;
 		/*
 		 * Skip the special fake placeholder.  This will also skip over
 		 * the name property when 'all' is specified.
 		 */
 		if (pl->pl_prop == ZFS_PROP_NAME &&
 		    pl == cbp->cb_proplist)
 			continue;
 
 		if (pl->pl_prop != ZPROP_INVAL) {
 			/* Native property. */
 			if (zfs_prop_get(zhp, pl->pl_prop, buf,
 			    sizeof (buf), &sourcetype, source,
 			    sizeof (source),
 			    cbp->cb_literal) != 0) {
 				/*
 				 * Entries flagged pl_all are skipped silently
 				 * when the lookup fails; explicitly requested
 				 * ones are shown as "-" unless the property is
 				 * not valid for any dataset type.
 				 */
 				if (pl->pl_all)
 					continue;
 				if (!zfs_prop_valid_for_type(pl->pl_prop,
 				    ZFS_TYPE_DATASET)) {
 					(void) fprintf(stderr,
 					    gettext("No such property '%s'\n"),
 					    zfs_prop_to_name(pl->pl_prop));
 					continue;
 				}
 				sourcetype = ZPROP_SRC_NONE;
 				(void) strlcpy(buf, "-", sizeof (buf));
 			}
 
 			if (received && (zfs_prop_get_recvd(zhp,
 			    zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
 			    cbp->cb_literal) == 0))
 				recvdval = rbuf;
 
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    zfs_prop_to_name(pl->pl_prop),
 			    buf, sourcetype, source, recvdval);
 		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
 			/* Shown as a local value, or "-" if the lookup fails. */
 			sourcetype = ZPROP_SRC_LOCAL;
 
 			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
 			    buf, sizeof (buf), cbp->cb_literal) != 0) {
 				sourcetype = ZPROP_SRC_NONE;
 				(void) strlcpy(buf, "-", sizeof (buf));
 			}
 
 			/* 'source' is passed on without being set here. */
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, buf, sourcetype, source, NULL);
 		} else if (zfs_prop_written(pl->pl_user_prop)) {
 			/* Same treatment as the userquota case above. */
 			sourcetype = ZPROP_SRC_LOCAL;
 
 			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
 			    buf, sizeof (buf), cbp->cb_literal) != 0) {
 				sourcetype = ZPROP_SRC_NONE;
 				(void) strlcpy(buf, "-", sizeof (buf));
 			}
 
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, buf, sourcetype, source, NULL);
 		} else {
 			/* User property: value and source come from an nvlist. */
 			if (nvlist_lookup_nvlist(user_props,
 			    pl->pl_user_prop, &propval) != 0) {
 				if (pl->pl_all)
 					continue;
 				sourcetype = ZPROP_SRC_NONE;
 				strval = "-";
 			} else {
 				verify(nvlist_lookup_string(propval,
 				    ZPROP_VALUE, &strval) == 0);
 				verify(nvlist_lookup_string(propval,
 				    ZPROP_SOURCE, &sourceval) == 0);
 
 				/*
 				 * The stored source string decides the source
 				 * type: this dataset's own name means local,
 				 * the "received" marker means received, and
 				 * anything else is treated as the dataset the
 				 * value is inherited from.
 				 */
 				if (strcmp(sourceval,
 				    zfs_get_name(zhp)) == 0) {
 					sourcetype = ZPROP_SRC_LOCAL;
 				} else if (strcmp(sourceval,
 				    ZPROP_SOURCE_VAL_RECVD) == 0) {
 					sourcetype = ZPROP_SRC_RECEIVED;
 				} else {
 					sourcetype = ZPROP_SRC_INHERITED;
 					(void) strlcpy(source,
 					    sourceval, sizeof (source));
 				}
 			}
 
 			if (received && (zfs_prop_get_recvd(zhp,
 			    pl->pl_user_prop, rbuf, sizeof (rbuf),
 			    cbp->cb_literal) == 0))
 				recvdval = rbuf;
 
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, strval, sourcetype,
 			    source, recvdval);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Handler for the 'get' subcommand.  Parses the options into a
  * zprop_get_cbdata_t (columns, sources, output mode) plus iteration flags
  * and a type mask, builds the property list from the first operand, and runs
  * get_callback() over the remaining operands through zfs_for_each().
  * Returns the result of zfs_for_each().
  */
 static int
 zfs_do_get(int argc, char **argv)
 {
 	zprop_get_cbdata_t cb = { 0 };
 	int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
 	int types = ZFS_TYPE_DATASET;
 	char *value, *fields;
 	int ret = 0;
 	int limit = 0;
 	/* Stack-allocated list head that is linked in front of the real list. */
 	zprop_list_t fake_name = { 0 };
 
 	/*
 	 * Set up default columns and sources.
 	 */
 	cb.cb_sources = ZPROP_SRC_ALL;
 	cb.cb_columns[0] = GET_COL_NAME;
 	cb.cb_columns[1] = GET_COL_PROPERTY;
 	cb.cb_columns[2] = GET_COL_VALUE;
 	cb.cb_columns[3] = GET_COL_SOURCE;
 	cb.cb_type = ZFS_TYPE_DATASET;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
 		switch (c) {
 		case 'p':
 			cb.cb_literal = B_TRUE;
 			break;
 		case 'd':
 			limit = parse_depth(optarg, &flags);
 			break;
 		case 'r':
 			flags |= ZFS_ITER_RECURSE;
 			break;
 		case 'H':
 			cb.cb_scripted = B_TRUE;
 			break;
 		case ':':
 			/* The leading ':' in the option string selects this. */
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case 'o':
 			/*
 			 * Process the set of columns to display.  We zero out
 			 * the structure to give us a blank slate.
 			 */
 			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
 			i = 0;
 			while (*optarg != '\0') {
 				/* Index in this table == case label below. */
 				static char *col_subopts[] =
 				    { "name", "property", "value", "received",
 				    "source", "all", NULL };
 
 				if (i == ZFS_GET_NCOLS) {
 					(void) fprintf(stderr, gettext("too "
 					    "many fields given to -o "
 					    "option\n"));
 					usage(B_FALSE);
 				}
 
 				switch (getsubopt(&optarg, col_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_columns[i++] = GET_COL_NAME;
 					break;
 				case 1:
 					cb.cb_columns[i++] = GET_COL_PROPERTY;
 					break;
 				case 2:
 					cb.cb_columns[i++] = GET_COL_VALUE;
 					break;
 				case 3:
 					/* Received values must be fetched too. */
 					cb.cb_columns[i++] = GET_COL_RECVD;
 					flags |= ZFS_ITER_RECVD_PROPS;
 					break;
 				case 4:
 					cb.cb_columns[i++] = GET_COL_SOURCE;
 					break;
 				case 5:
 					/* "all" must be the first field given. */
 					if (i > 0) {
 						(void) fprintf(stderr,
 						    gettext("\"all\" conflicts "
 						    "with specific fields "
 						    "given to -o option\n"));
 						usage(B_FALSE);
 					}
 					cb.cb_columns[0] = GET_COL_NAME;
 					cb.cb_columns[1] = GET_COL_PROPERTY;
 					cb.cb_columns[2] = GET_COL_VALUE;
 					cb.cb_columns[3] = GET_COL_RECVD;
 					cb.cb_columns[4] = GET_COL_SOURCE;
 					flags |= ZFS_ITER_RECVD_PROPS;
 					/*
 					 * Marks the array as full, so any
 					 * further field trips the "too many
 					 * fields" check above.
 					 */
 					i = ZFS_GET_NCOLS;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid column name "
 					    "'%s'\n"), suboptarg);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case 's':
 			/* Start from no sources and OR in each one named. */
 			cb.cb_sources = 0;
 			while (*optarg != '\0') {
 				static char *source_subopts[] = {
 					"local", "default", "inherited",
 					"received", "temporary", "none",
 					NULL };
 
 				switch (getsubopt(&optarg, source_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_sources |= ZPROP_SRC_LOCAL;
 					break;
 				case 1:
 					cb.cb_sources |= ZPROP_SRC_DEFAULT;
 					break;
 				case 2:
 					cb.cb_sources |= ZPROP_SRC_INHERITED;
 					break;
 				case 3:
 					cb.cb_sources |= ZPROP_SRC_RECEIVED;
 					break;
 				case 4:
 					cb.cb_sources |= ZPROP_SRC_TEMPORARY;
 					break;
 				case 5:
 					cb.cb_sources |= ZPROP_SRC_NONE;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid source "
 					    "'%s'\n"), suboptarg);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case 't':
 			/* An explicit type list replaces the default mask. */
 			types = 0;
 			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
 			while (*optarg != '\0') {
 				static char *type_subopts[] = { "filesystem",
 				    "volume", "snapshot", "bookmark",
 				    "all", NULL };
 
 				switch (getsubopt(&optarg, type_subopts,
 				    &value)) {
 				case 0:
 					types |= ZFS_TYPE_FILESYSTEM;
 					break;
 				case 1:
 					types |= ZFS_TYPE_VOLUME;
 					break;
 				case 2:
 					types |= ZFS_TYPE_SNAPSHOT;
 					break;
 				case 3:
 					types |= ZFS_TYPE_BOOKMARK;
 					break;
 				case 4:
 					/* Assigns, so earlier choices are lost. */
 					types = ZFS_TYPE_DATASET |
 					    ZFS_TYPE_BOOKMARK;
 					break;
 
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid type '%s'\n"),
 					    suboptarg);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* The first operand is the property list; the rest are datasets. */
 	fields = argv[0];
 
 	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
 	    != 0)
 		usage(B_FALSE);
 
 	argc--;
 	argv++;
 
 	/*
 	 * As part of zfs_expand_proplist(), we keep track of the maximum column
 	 * width for each property.  For the 'NAME' (and 'SOURCE') columns, we
 	 * need to know the maximum name length.  However, the user likely did
 	 * not specify 'name' as one of the properties to fetch, so we need to
 	 * make sure we always include at least this property for
 	 * print_get_headers() to work properly.
 	 */
 	if (cb.cb_proplist != NULL) {
 		fake_name.pl_prop = ZFS_PROP_NAME;
 		fake_name.pl_width = strlen(gettext("NAME"));
 		fake_name.pl_next = cb.cb_proplist;
 		cb.cb_proplist = &fake_name;
 	}
 
 	cb.cb_first = B_TRUE;
 
 	/* run for each object */
 	ret = zfs_for_each(argc, argv, flags, types, NULL,
 	    &cb.cb_proplist, limit, get_callback, &cb);
 
 	/*
 	 * fake_name lives on the stack and must not be handed to
 	 * zprop_free_list(); free only the list linked behind it.
 	 */
 	if (cb.cb_proplist == &fake_name)
 		zprop_free_list(fake_name.pl_next);
 	else
 		zprop_free_list(cb.cb_proplist);
 
 	return (ret);
 }
 
 /*
  * inherit [-rS] <property> <fs|vol> ...
  *
  *	-r	Recurse over all children
  *	-S	Revert to received value, if any
  *
  * For each dataset specified on the command line, inherit the given property
  * from its parent.  Inheriting a property at the pool level will cause it to
  * use the default value.  The '-r' flag will recurse over all children, and is
  * useful for setting a property on a hierarchy-wide basis, regardless of any
  * local modifications for each dataset.
  */
 
 /* Arguments handed to the 'zfs inherit' per-dataset callbacks. */
 typedef struct inherit_cbdata {
 	/* property name as given on the command line */
 	const char *cb_propname;
 	/* '-S': passed to zfs_prop_inherit() as its "received" argument */
 	boolean_t cb_received;
 } inherit_cbdata_t;
 
 /*
  * Per-dataset callback for a recursive 'zfs inherit'.  A native property
  * that is not valid for this dataset's type is silently skipped; otherwise
  * the property is inherited.  Returns 0 on success or skip, 1 on failure.
  */
 static int
 inherit_recurse_cb(zfs_handle_t *zhp, void *data)
 {
 	inherit_cbdata_t *cb = data;
 	zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
 	boolean_t applicable;
 
 	/* Names that map to no native property are always attempted. */
 	applicable = (prop == ZPROP_INVAL ||
 	    zfs_prop_valid_for_type(prop, zfs_get_type(zhp)));
 	if (!applicable)
 		return (0);
 
 	if (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Per-dataset callback for a non-recursive 'zfs inherit'.
  * Returns 0 on success, 1 on failure.
  */
 static int
 inherit_cb(zfs_handle_t *zhp, void *data)
 {
 	inherit_cbdata_t *cb = data;
 	int rc;
 
 	rc = zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received);
 	return (rc != 0);
 }
 
 /*
  * Handler for the 'inherit' subcommand.  Validates the property named by the
  * first operand and then applies zfs_prop_inherit() to every dataset that
  * follows, recursing into children when '-r' is given.  Returns 1 when the
  * property is rejected, otherwise the result of zfs_for_each().
  */
 static int
 zfs_do_inherit(int argc, char **argv)
 {
 	inherit_cbdata_t cb = { 0 };
 	boolean_t received = B_FALSE;
 	zfs_prop_t prop;
 	char *propname;
 	int flags = 0;
 	int ret = 0;
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "rS")) != -1) {
 		if (c == 'r') {
 			flags |= ZFS_ITER_RECURSE;
 		} else if (c == 'S') {
 			received = B_TRUE;
 		} else {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* Peel the property name off; what remains are the datasets. */
 	propname = argv[0];
 	argc--;
 	argv++;
 
 	prop = zfs_name_to_prop(propname);
 	if (prop == ZPROP_INVAL) {
 		/* Not a native property, so it has to be a user property. */
 		if (!zfs_prop_user(propname)) {
 			(void) fprintf(stderr,
 			    gettext("invalid property '%s'\n"), propname);
 			usage(B_FALSE);
 		}
 	} else if (zfs_prop_readonly(prop)) {
 		(void) fprintf(stderr, gettext(
 		    "%s property is read-only\n"),
 		    propname);
 		return (1);
 	} else if (!zfs_prop_inheritable(prop) && !received) {
 		boolean_t clearable = (prop == ZFS_PROP_QUOTA ||
 		    prop == ZFS_PROP_RESERVATION ||
 		    prop == ZFS_PROP_REFQUOTA ||
 		    prop == ZFS_PROP_REFRESERVATION);
 
 		(void) fprintf(stderr, gettext("'%s' property cannot "
 		    "be inherited\n"), propname);
 		/* Quota-like properties get a hint on how to clear them. */
 		if (clearable) {
 			(void) fprintf(stderr, gettext("use 'zfs set "
 			    "%s=none' to clear\n"), propname);
 			(void) fprintf(stderr, gettext("use 'zfs "
 			    "inherit -S %s' to revert to received "
 			    "value\n"), propname);
 		}
 		return (1);
 	} else if (received &&
 	    (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) {
 		(void) fprintf(stderr, gettext("'%s' property cannot "
 		    "be reverted to a received value\n"), propname);
 		return (1);
 	}
 
 	cb.cb_propname = propname;
 	cb.cb_received = received;
 
 	/* The recursive callback additionally skips inapplicable datasets. */
 	ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
 	    NULL, NULL, 0,
 	    (flags & ZFS_ITER_RECURSE) ? inherit_recurse_cb : inherit_cb,
 	    &cb);
 
 	return (ret);
 }
 
 /* State shared between 'zfs upgrade' and its per-filesystem callbacks. */
 typedef struct upgrade_cbdata {
 	/* filesystems whose version property was set successfully */
 	uint64_t cb_numupgraded;
 	/* filesystems already at the requested version */
 	uint64_t cb_numsamegraded;
 	/* filesystems that could not be brought to the requested version */
 	uint64_t cb_numfailed;
 	/* requested filesystem version */
 	uint64_t cb_version;
 	/* list filesystems newer (rather than older) than ZPL_VERSION */
 	boolean_t cb_newer;
 	/* set once the listing header has been printed */
 	boolean_t cb_foundone;
 	/* name of the filesystem most recently passed to zfs_prop_set() */
 	char cb_lastfs[ZFS_MAXNAMELEN];
 } upgrade_cbdata_t;
 
 /*
  * Return non-zero when 'name' and the dataset behind 'zhp' share the same
  * leading name component, i.e. the text before the first '/' or '@'.
  */
 static int
 same_pool(zfs_handle_t *zhp, const char *name)
 {
 	const char *zhname = zfs_get_name(zhp);
 	int namelen = strcspn(name, "/@");
 	int zhlen = strcspn(zhname, "/@");
 
 	return (namelen == zhlen && strncmp(name, zhname, namelen) == 0);
 }
 
 /*
  * Listing callback for 'zfs upgrade': print the filesystem if its version is
  * older than ZPL_VERSION (or newer, when cb_newer is set).  An explanatory
  * banner and table header are printed before the first hit.  Always
  * returns 0.
  */
 static int
 upgrade_list_callback(zfs_handle_t *zhp, void *data)
 {
 	upgrade_cbdata_t *cb = data;
 	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
 	boolean_t listed;
 
 	/* list if it's old/new */
 	if (cb->cb_newer)
 		listed = (version > ZPL_VERSION);
 	else
 		listed = (version < ZPL_VERSION);
 	if (!listed)
 		return (0);
 
 	if (!cb->cb_foundone) {
 		char *banner;
 
 		if (cb->cb_newer) {
 			banner = gettext("The following filesystems are "
 			    "formatted using a newer software version and\n"
 			    "cannot be accessed on the current system.\n\n");
 		} else {
 			banner = gettext("The following filesystems are "
 			    "out of date, and can be upgraded.  After being\n"
 			    "upgraded, these filesystems (and any 'zfs send' "
 			    "streams generated from\n"
 			    "subsequent snapshots) will no longer be "
 			    "accessible by older software versions.\n\n");
 		}
 
 		(void) puts(banner);
 		(void) printf(gettext("VER  FILESYSTEM\n"));
 		(void) printf(gettext("---  ------------\n"));
 		cb->cb_foundone = B_TRUE;
 	}
 
 	(void) printf("%2u   %s\n", version, zfs_get_name(zhp));
 
 	return (0);
 }
 
 /*
  * Per-filesystem callback for 'zfs upgrade' when a filesystem or '-a' was
  * given.  Sets the "version" property to cb_version if the filesystem is
  * older and the pool version permits it, and keeps the counters in the
  * callback data up to date.
  *
  * Returns -1 if the pool version cannot be determined or cb_version maps to
  * no pool version; otherwise 0, with failures recorded in cb_numfailed.
  */
 static int
 upgrade_set_callback(zfs_handle_t *zhp, void *data)
 {
 	upgrade_cbdata_t *cb = data;
 	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
 	int needed_spa_version;
 	int spa_version;
 
 	if (zfs_spa_version(zhp, &spa_version) < 0)
 		return (-1);
 
 	needed_spa_version = zfs_spa_version_map(cb->cb_version);
 
 	if (needed_spa_version < 0)
 		return (-1);
 
 	if (spa_version < needed_spa_version) {
 		/* can't upgrade */
 		(void) printf(gettext("%s: can not be "
 		    "upgraded; the pool version needs to first "
 		    "be upgraded\nto version %d\n\n"),
 		    zfs_get_name(zhp), needed_spa_version);
 		cb->cb_numfailed++;
 		return (0);
 	}
 
 	/* upgrade */
 	if (version < cb->cb_version) {
 		char verstr[16];
 
 		/* Cast so the argument matches "%llu" on every platform. */
 		(void) snprintf(verstr, sizeof (verstr),
 		    "%llu", (unsigned long long)cb->cb_version);
 		if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
 			/*
 			 * If they did "zfs upgrade -a", then we could
 			 * be doing ioctls to different pools.  We need
 			 * to log this history once to each pool, and bypass
 			 * the normal history logging that happens in main().
 			 */
 			(void) zpool_log_history(g_zfs, history_str);
 			log_history = B_FALSE;
 		}
 		if (zfs_prop_set(zhp, "version", verstr) == 0)
 			cb->cb_numupgraded++;
 		else
 			cb->cb_numfailed++;
 		/* Bounded copy: cb_lastfs is a fixed-size buffer. */
 		(void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp),
 		    sizeof (cb->cb_lastfs));
 	} else if (version > cb->cb_version) {
 		/* can't downgrade */
 		(void) printf(gettext("%s: can not be downgraded; "
 		    "it is already at version %u\n"),
 		    zfs_get_name(zhp), version);
 		cb->cb_numfailed++;
 	} else {
 		cb->cb_numsamegraded++;
 	}
 	return (0);
 }
 
 /*
  * zfs upgrade
  * zfs upgrade -v
  * zfs upgrade [-r] [-V <version>] <-a | filesystem>
  *
  * With no arguments, lists filesystems whose version differs from the one
  * this software runs (two passes of upgrade_list_callback, the second with
  * cb_newer set).  With -v, prints the table of supported filesystem versions.
  * With -a or a filesystem argument, runs upgrade_set_callback on each
  * selected filesystem, targeting the -V version or ZPL_VERSION by default.
  *
  * Returns 0 on success, 1 if any filesystem failed to upgrade, or the
  * non-zero result of zfs_for_each().
  */
 static int
 zfs_do_upgrade(int argc, char **argv)
 {
 	boolean_t all = B_FALSE;
 	boolean_t showversions = B_FALSE;
 	int ret = 0;
 	upgrade_cbdata_t cb = { 0 };
 	int c;
 	int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "rvV:a")) != -1) {
 		switch (c) {
 		case 'r':
 			flags |= ZFS_ITER_RECURSE;
 			break;
 		case 'v':
 			showversions = B_TRUE;
 			break;
 		case 'V':
 			if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
 			    optarg, &cb.cb_version) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid version %s\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 'a':
 			all = B_TRUE;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* -r and -V only make sense together with -a or a filesystem. */
 	if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) || cb.cb_version))
 		usage(B_FALSE);
 	/* -v excludes every other option and any operand. */
 	if (showversions && (flags & ZFS_ITER_RECURSE || all ||
 	    cb.cb_version || argc))
 		usage(B_FALSE);
 	/* -a and explicit filesystems are mutually exclusive. */
 	if (all && argc)
 		usage(B_FALSE);
 
 	if (showversions) {
 		/* Show info on available versions. */
 		(void) printf(gettext("The following filesystem versions are "
 		    "supported:\n\n"));
 		(void) printf(gettext("VER  DESCRIPTION\n"));
 		(void) printf("---  -----------------------------------------"
 		    "---------------\n");
 		(void) printf(gettext(" 1   Initial ZFS filesystem version\n"));
 		(void) printf(gettext(" 2   Enhanced directory entries\n"));
 		(void) printf(gettext(" 3   Case insensitive and filesystem "
 		    "user identifier (FUID)\n"));
 		(void) printf(gettext(" 4   userquota, groupquota "
 		    "properties\n"));
 		(void) printf(gettext(" 5   System attributes\n"));
 		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases,\n"));
 		(void) printf("see the ZFS Administration Guide.\n\n");
 		ret = 0;
 	} else if (argc || all) {
 		/* Upgrade filesystems */
 		if (cb.cb_version == 0)
 			cb.cb_version = ZPL_VERSION;
 		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
 		    NULL, NULL, 0, upgrade_set_callback, &cb);
 		(void) printf(gettext("%llu filesystems upgraded\n"),
 		    cb.cb_numupgraded);
 		if (cb.cb_numsamegraded) {
 			(void) printf(gettext("%llu filesystems already at "
 			    "this version\n"),
 			    cb.cb_numsamegraded);
 		}
 		if (cb.cb_numfailed != 0)
 			ret = 1;
 	} else {
 		/* List old-version filesystems */
 		boolean_t found;
 		int ret2;
 
 		(void) printf(gettext("This system is currently running "
 		    "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
 
 		flags |= ZFS_ITER_RECURSE;
 		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
 		    NULL, NULL, 0, upgrade_list_callback, &cb);
 
 		found = cb.cb_foundone;
 		cb.cb_foundone = B_FALSE;
 		cb.cb_newer = B_TRUE;
 
 		ret2 = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
 		    NULL, NULL, 0, upgrade_list_callback, &cb);
 
 		/* Do not let the second pass hide a first-pass failure. */
 		if (ret == 0)
 			ret = ret2;
 
 		if (!cb.cb_foundone && !found) {
 			(void) printf(gettext("All filesystems are "
 			    "formatted with the current version.\n"));
 		}
 	}
 
 	return (ret);
 }
 
 /*
  * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
  *               [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
  * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
  *                [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
  *
  *	-H      Scripted mode; elide headers and separate columns by tabs.
  *	-i	Translate SID to POSIX ID.
  *	-n	Print numeric ID instead of user/group name.
  *	-o      Control which fields to display.
  *	-p	Use exact (parsable) numeric output.
  *	-s      Specify sort columns, ascending order.
  *	-S      Specify sort columns, descending order.
  *	-t      Control which object types to display.
  *
  *	Displays space consumed by, and quotas on, each user in the specified
  *	filesystem or snapshot.
  */
 
 /* us_field_types, us_field_hdr and us_field_names should be kept in sync */
 /* Output columns of 'zfs userspace' / 'zfs groupspace', used as array indices. */
 enum us_field_types {
 	USFIELD_TYPE,
 	USFIELD_NAME,
 	USFIELD_USED,
 	USFIELD_QUOTA
 };
 /* Column headers, indexed by enum us_field_types. */
 static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
 /* Field names accepted by -o and used as nvlist keys, same indexing. */
 static char *us_field_names[] = { "type", "name", "used", "quota" };
 /* Number of fields; also used as the terminator of a field index list. */
 #define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (char *))
 
 /* Bit flags identifying the kind of entity an entry describes. */
 #define	USTYPE_PSX_GRP	(1 << 0)
 #define	USTYPE_PSX_USR	(1 << 1)
 #define	USTYPE_SMB_GRP	(1 << 2)
 #define	USTYPE_SMB_USR	(1 << 3)
 #define	USTYPE_ALL	\
 	(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)
 
 /* us_type_bits[i] is the mask selected by the -t keyword us_type_names[i]. */
 static int us_type_bits[] = {
 	USTYPE_PSX_GRP,
 	USTYPE_PSX_USR,
 	USTYPE_SMB_GRP,
 	USTYPE_SMB_USR,
 	USTYPE_ALL
 };
 static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
 	"smbuser", "all" };
 
 /* One user or group entry; usn_nvl holds its type/name/used/quota values. */
 typedef struct us_node {
 	nvlist_t	*usn_nvl;
 	uu_avl_node_t	usn_avlnode;
 	uu_list_node_t	usn_listnode;
 } us_node_t;
 
 /* State handed to userspace_cb() through zfs_userspace(). */
 typedef struct us_cbdata {
 	nvlist_t	**cb_nvlp;	/* not referenced in the code shown here */
 	uu_avl_pool_t	*cb_avl_pool;	/* pool new nodes are initialized from */
 	uu_avl_t	*cb_avl;	/* tree collecting the entries */
 	boolean_t	cb_numname;	/* -n: skip POSIX name lookups */
 	boolean_t	cb_nicenum;	/* format sizes with zfs_nicenum() */
 	boolean_t	cb_sid2posix;	/* -i: report SMB entities as POSIX */
 	zfs_userquota_prop_t cb_prop;	/* property currently being iterated */
 	zfs_sort_column_t *cb_sortcol;	/* sort columns given to us_compare() */
 	size_t		cb_width[USFIELD_LAST];	/* widest value seen per field */
 } us_cbdata_t;
 
 /*
  * Set once zfs_do_userspace() has collected all entries; until then
  * us_compare() ignores the "used" and "quota" sort columns.
  */
 static boolean_t us_populated = B_FALSE;
 
 /* Private argument passed to us_compare() by the AVL find calls. */
 typedef struct {
 	zfs_sort_column_t *si_sortcol;
 	boolean_t	si_numname;
 } us_sort_info_t;
 
 /*
  * Map a field name ("type", "name", "used" or "quota") to its position in
  * us_field_names.  Returns -1 when the name is not a known field.
  */
 static int
 us_field_index(char *field)
 {
 	int idx = 0;
 
 	while (idx < USFIELD_LAST) {
 		if (strcmp(us_field_names[idx], field) == 0)
 			return (idx);
 		idx++;
 	}
 
 	return (-1);
 }
 
 /*
  * AVL comparison routine for us_node_t entries.  'unused' actually carries a
  * us_sort_info_t holding the list of sort columns and the numeric-name flag.
  * Columns are examined in order and the first one that differs decides the
  * result, inverted when that column is marked sc_reverse.  The "used" and
  * "quota" columns are ignored until us_populated is set.
  *
  * Returns -1, 0 or 1.
  */
 static int
 us_compare(const void *larg, const void *rarg, void *unused)
 {
 	const us_node_t *lnode = larg;
 	const us_node_t *rnode = rarg;
 	us_sort_info_t *info = (us_sort_info_t *)unused;
 	nvlist_t *lprops = lnode->usn_nvl;
 	nvlist_t *rprops = rnode->usn_nvl;
 	zfs_sort_column_t *col;
 	boolean_t lsmb, rsmb;
 	int cmp = 0;
 
 	for (col = info->si_sortcol; col != NULL; col = col->sc_next) {
 		zfs_prop_t prop = col->sc_prop;
 		uint64_t lnum = 0;
 		uint64_t rnum = 0;
 
 		if (prop == ZFS_PROP_TYPE) {
 			uint32_t ltype = 0;
 			uint32_t rtype = 0;
 
 			(void) nvlist_lookup_uint32(lprops, "type", &ltype);
 			(void) nvlist_lookup_uint32(rprops, "type", &rtype);
 			if (rtype != ltype)
 				cmp = (rtype < ltype) ? 1 : -1;
 		} else if (prop == ZFS_PROP_NAME) {
 			if (info->si_numname) {
 				(void) nvlist_lookup_uint64(lprops, "name",
 				    &lnum);
 				(void) nvlist_lookup_uint64(rprops, "name",
 				    &rnum);
 				if (rnum != lnum)
 					cmp = (rnum < lnum) ? 1 : -1;
 			} else {
 				char *lname = "";
 				char *rname = "";
 
 				(void) nvlist_lookup_string(lprops, "name",
 				    &lname);
 				(void) nvlist_lookup_string(rprops, "name",
 				    &rname);
 				cmp = strcmp(lname, rname);
 			}
 		} else if (prop == ZFS_PROP_USED || prop == ZFS_PROP_QUOTA) {
 			/* Sizes only take part once all entries are loaded. */
 			if (us_populated) {
 				const char *key =
 				    (prop == ZFS_PROP_USED) ? "used" : "quota";
 
 				(void) nvlist_lookup_uint64(lprops, key, &lnum);
 				(void) nvlist_lookup_uint64(rprops, key, &rnum);
 				if (rnum != lnum)
 					cmp = (rnum < lnum) ? 1 : -1;
 			}
 		}
 
 		if (cmp < 0)
 			return (col->sc_reverse ? 1 : -1);
 		if (cmp > 0)
 			return (col->sc_reverse ? -1 : 1);
 	}
 
 	/*
 	 * Every sort column compared equal.  The "smbentity" flag is present
 	 * only when SID to POSIX ID translation is in effect, where the same
 	 * type/name pair may occur twice; use it as the final tie breaker.
 	 */
 	if (nvlist_lookup_boolean_value(lprops, "smbentity", &lsmb) != 0 ||
 	    nvlist_lookup_boolean_value(rprops, "smbentity", &rsmb) != 0)
 		return (0);
 
 	if (lsmb == rsmb)
 		return (0);
 
 	return (lsmb < rsmb ? -1 : 1);
 }
 
 /*
  * Return the display string for a single USTYPE_* value, or "Undefined" for
  * anything else (including combinations of several bits).
  */
 static inline const char *
 us_type2str(unsigned field_type)
 {
 	if (field_type == USTYPE_PSX_USR)
 		return ("POSIX User");
 	if (field_type == USTYPE_PSX_GRP)
 		return ("POSIX Group");
 	if (field_type == USTYPE_SMB_USR)
 		return ("SMB User");
 	if (field_type == USTYPE_SMB_GRP)
 		return ("SMB Group");
 
 	return ("Undefined");
 }
 
 /*
  * zfs_userspace() callback: record one (domain, rid, space) tuple for the
  * property currently selected in the us_cbdata_t passed as 'arg'.
  *
  * A node carrying "type" and "name" (plus "smbentity" with -i) is built and
  * looked up in cb_avl.  If an equal node already exists the new one is
  * discarded and the existing node is updated, otherwise the new node is
  * inserted.  'space' is stored under "used" or "quota" depending on cb_prop,
  * and the other of the two is set to 0 if absent.  cb_width is widened as
  * needed for each field.
  *
  * Always returns 0; allocation failures end in nomem().
  */
 static int
 userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
 {
 	us_cbdata_t *cb = (us_cbdata_t *)arg;
 	zfs_userquota_prop_t prop = cb->cb_prop;
 	char *name = NULL;
 	char *propname;
 	char sizebuf[32];
 	/*
 	 * 'name' may end up pointing at this buffer and is read further down,
 	 * so the buffer has to live for the whole function.
 	 */
 	char sid[ZFS_MAXNAMELEN + 32];
 	us_node_t *node;
 	uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
 	uu_avl_t *avl = cb->cb_avl;
 	uu_avl_index_t idx;
 	nvlist_t *props;
 	us_node_t *n;
 	zfs_sort_column_t *sortcol = cb->cb_sortcol;
 	unsigned type;
 	const char *typestr;
 	size_t namelen;
 	size_t typelen;
 	size_t sizelen;
 	int typeidx, nameidx, sizeidx;
 	us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
 	boolean_t smbentity = B_FALSE;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 	node = safe_malloc(sizeof (us_node_t));
 	uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
 	node->usn_nvl = props;
 
 	if (domain != NULL && domain[0] != '\0') {
 		/* SMB */
 		uid_t id;
 #ifdef illumos
 		int err;
 		int flag = IDMAP_REQ_FLG_USE_CACHE;
 #endif
 
 		smbentity = B_TRUE;
 
 		(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
 
 		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
 			type = USTYPE_SMB_GRP;
 #ifdef illumos
 			err = sid_to_id(sid, B_FALSE, &id);
 #endif
 		} else {
 			type = USTYPE_SMB_USR;
 #ifdef illumos
 			err = sid_to_id(sid, B_TRUE, &id);
 #endif
 		}
 
 #ifdef illumos
 		if (err == 0) {
 			rid = id;
 			if (!cb->cb_sid2posix) {
 				if (type == USTYPE_SMB_USR) {
 					(void) idmap_getwinnamebyuid(rid, flag,
 					    &name, NULL);
 				} else {
 					(void) idmap_getwinnamebygid(rid, flag,
 					    &name, NULL);
 				}
 				if (name == NULL)
 					name = sid;
 			}
 		}
 #endif
 	}
 
 	if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
 		/* POSIX or -i */
 		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
 			type = USTYPE_PSX_GRP;
 			if (!cb->cb_numname) {
 				struct group *g;
 
 				if ((g = getgrgid(rid)) != NULL)
 					name = g->gr_name;
 			}
 		} else {
 			type = USTYPE_PSX_USR;
 			if (!cb->cb_numname) {
 				struct passwd *p;
 
 				if ((p = getpwuid(rid)) != NULL)
 					name = p->pw_name;
 			}
 		}
 	}
 
 	/*
 	 * Make sure that the type/name combination is unique when doing
 	 * SID to POSIX ID translation (hence changing the type from SMB to
 	 * POSIX).
 	 */
 	if (cb->cb_sid2posix &&
 	    nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
 		nomem();
 
 	/* Calculate/update width of TYPE field */
 	typestr = us_type2str(type);
 	typelen = strlen(gettext(typestr));
 	typeidx = us_field_index("type");
 	if (typelen > cb->cb_width[typeidx])
 		cb->cb_width[typeidx] = typelen;
 	if (nvlist_add_uint32(props, "type", type) != 0)
 		nomem();
 
 	/* Calculate/update width of NAME field */
 	if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
 		if (nvlist_add_uint64(props, "name", rid) != 0)
 			nomem();
 		namelen = snprintf(NULL, 0, "%u", rid);
 	} else {
 		if (nvlist_add_string(props, "name", name) != 0)
 			nomem();
 		namelen = strlen(name);
 	}
 	nameidx = us_field_index("name");
 	if (namelen > cb->cb_width[nameidx])
 		cb->cb_width[nameidx] = namelen;
 
 	/*
 	 * Check if this type/name combination is in the list and update it;
 	 * otherwise add new node to the list.
 	 */
 	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
 		uu_avl_insert(avl, node, idx);
 	} else {
 		nvlist_free(props);
 		free(node);
 		node = n;
 		props = node->usn_nvl;
 	}
 
 	/* Calculate/update width of USED/QUOTA fields */
 	if (cb->cb_nicenum)
 		zfs_nicenum(space, sizebuf, sizeof (sizebuf));
 	else
 		(void) snprintf(sizebuf, sizeof (sizebuf), "%llu",
 		    (unsigned long long)space);
 	sizelen = strlen(sizebuf);
 	if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) {
 		propname = "used";
 		if (!nvlist_exists(props, "quota"))
 			(void) nvlist_add_uint64(props, "quota", 0);
 	} else {
 		propname = "quota";
 		if (!nvlist_exists(props, "used"))
 			(void) nvlist_add_uint64(props, "used", 0);
 	}
 	sizeidx = us_field_index(propname);
 	if (sizelen > cb->cb_width[sizeidx])
 		cb->cb_width[sizeidx] = sizelen;
 
 	if (nvlist_add_uint64(props, propname, space) != 0)
 		nomem();
 
 	return (0);
 }
 
 /*
  * Print one output row for 'node'.
  *
  *	scripted	separate columns with a tab and do not pad them
  *	parsable	print used/quota as plain numbers, not zfs_nicenum()
  *	fields		field indices to print, terminated by USFIELD_LAST
  *	types		USTYPE_* mask; rows of any other type are skipped
  *	width		per-field column widths, indexed by field
  *
  * A quota whose formatted value is "0" is shown as "none".  A field that has
  * no value in the node is shown as "-".
  */
 static void
 print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
     size_t *width, us_node_t *node)
 {
 	nvlist_t *nvl = node->usn_nvl;
 	char valstr[ZFS_MAXNAMELEN];
 	boolean_t first = B_TRUE;
 	int cfield = 0;
 	int field;
 	uint32_t ustype = 0;
 
 	/* Check type */
 	(void) nvlist_lookup_uint32(nvl, "type", &ustype);
 	if (!(ustype & types))
 		return;
 
 	while ((field = fields[cfield]) != USFIELD_LAST) {
 		nvpair_t *nvp = NULL;
 		boolean_t is_u64 = B_FALSE;
 		uint32_t val32 = 0;
 		uint64_t val64 = 0;
 		char *strval = NULL;
 
 		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 			if (strcmp(nvpair_name(nvp),
 			    us_field_names[field]) == 0)
 				break;
 		}
 
 		/* Only inspect the pair if the field was actually found. */
 		if (nvp != NULL) {
 			switch (nvpair_type(nvp)) {
 			case DATA_TYPE_UINT32:
 				(void) nvpair_value_uint32(nvp, &val32);
 				break;
 			case DATA_TYPE_UINT64:
 				(void) nvpair_value_uint64(nvp, &val64);
 				is_u64 = B_TRUE;
 				break;
 			case DATA_TYPE_STRING:
 				(void) nvpair_value_string(nvp, &strval);
 				break;
 			default:
 				(void) fprintf(stderr, "invalid data type\n");
 			}
 		}
 
 		switch (field) {
 		case USFIELD_TYPE:
 			strval = (char *)us_type2str(val32);
 			break;
 		case USFIELD_NAME:
 			if (is_u64) {
 				(void) snprintf(valstr, sizeof (valstr),
 				    "%llu", (unsigned long long)val64);
 				strval = valstr;
 			}
 			break;
 		case USFIELD_USED:
 		case USFIELD_QUOTA:
 			if (is_u64) {
 				if (parsable) {
 					(void) snprintf(valstr,
 					    sizeof (valstr), "%llu",
 					    (unsigned long long)val64);
 				} else {
 					zfs_nicenum(val64, valstr,
 					    sizeof (valstr));
 				}
 				if (field == USFIELD_QUOTA &&
 				    strcmp(valstr, "0") == 0)
 					strval = "none";
 				else
 					strval = valstr;
 			}
 			break;
 		}
 
 		/* Never hand a NULL pointer to "%s". */
 		if (strval == NULL)
 			strval = "-";
 
 		if (!first) {
 			if (scripted)
 				(void) printf("\t");
 			else
 				(void) printf("  ");
 		}
 		/* The '*' width argument must be an int, not a size_t. */
 		if (scripted)
 			(void) printf("%s", strval);
 		else if (field == USFIELD_TYPE || field == USFIELD_NAME)
 			(void) printf("%-*s", (int)width[field], strval);
 		else
 			(void) printf("%*s", (int)width[field], strval);
 
 		first = B_FALSE;
 		cfield++;
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print the whole table: a header line (unless 'scripted') followed by one
  * row per node in 'avl', in tree order.  TYPE and NAME are left-justified,
  * the remaining columns right-justified.  When 'rmnode' is set, each node's
  * nvlist is freed after its row has been printed; the nodes themselves stay
  * in the tree.
  */
 static void
 print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
     size_t *width, boolean_t rmnode, uu_avl_t *avl)
 {
 	us_node_t *node;
 	const char *col;
 	int cfield = 0;
 	int field;
 
 	if (!scripted) {
 		boolean_t first = B_TRUE;
 
 		while ((field = fields[cfield]) != USFIELD_LAST) {
 			col = gettext(us_field_hdr[field]);
 			/* The '*' width argument must be an int. */
 			if (field == USFIELD_TYPE || field == USFIELD_NAME) {
 				(void) printf(first ? "%-*s" : "  %-*s",
 				    (int)width[field], col);
 			} else {
 				(void) printf(first ? "%*s" : "  %*s",
 				    (int)width[field], col);
 			}
 			first = B_FALSE;
 			cfield++;
 		}
 		(void) printf("\n");
 	}
 
 	for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
 		print_us_node(scripted, parsable, fields, types, width, node);
 		if (rmnode)
 			nvlist_free(node->usn_nvl);
 	}
 }
 
 /*
  * Implementation of 'zfs userspace' and 'zfs groupspace' (selected by
  * argv[0]).  Parses the options, collects per-user/group usage and quota
  * through zfs_userspace()/userspace_cb() into an AVL tree keyed on type and
  * name, re-sorts the tree using the full list of sort columns, and prints
  * the result.
  *
  * Returns 0 on success, -1 for an invalid -o or -t argument, 1 if the
  * dataset cannot be opened, or the non-zero result of zfs_userspace().
  */
 static int
 zfs_do_userspace(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	zfs_userquota_prop_t p;
 
 	uu_avl_pool_t *avl_pool;
 	uu_avl_t *avl_tree;
 	uu_avl_walk_t *walk;
 	char *delim;
 	char deffields[] = "type,name,used,quota";
 	char *ofield = NULL;
 	char *tfield = NULL;
 	int cfield = 0;
 	int fields[256];
 	int i;
 	boolean_t scripted = B_FALSE;
 	boolean_t prtnum = B_FALSE;
 	boolean_t parsable = B_FALSE;
 	boolean_t sid2posix = B_FALSE;
 	int ret = 0;
 	int c;
 	zfs_sort_column_t *sortcol = NULL;
 	int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
 	us_cbdata_t cb;
 	us_node_t *node;
 	us_node_t *rmnode;
 	uu_list_pool_t *listpool;
 	uu_list_t *list;
 	uu_avl_index_t idx = 0;
 	uu_list_index_t idx2 = 0;
 
 	if (argc < 2)
 		usage(B_FALSE);
 
 	if (strcmp(argv[0], "groupspace") == 0)
 		/* Toggle default group types */
 		types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
 
 	while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
 		switch (c) {
 		case 'n':
 			prtnum = B_TRUE;
 			break;
 		case 'H':
 			scripted = B_TRUE;
 			break;
 		case 'p':
 			parsable = B_TRUE;
 			break;
 		case 'o':
 			ofield = optarg;
 			break;
 		case 's':
 		case 'S':
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    c == 's' ? B_FALSE : B_TRUE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid field '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 't':
 			tfield = optarg;
 			break;
 		case 'i':
 			sid2posix = B_TRUE;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing dataset name\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* Use default output fields if not specified using -o */
 	if (ofield == NULL)
 		ofield = deffields;
 	do {
 		if ((delim = strchr(ofield, ',')) != NULL)
 			*delim = '\0';
 		/*
 		 * The -o list comes from the command line; refuse to write
 		 * past fields[], keeping one slot for the terminator.
 		 */
 		if (cfield >= (int)(sizeof (fields) / sizeof (fields[0])) - 1) {
 			(void) fprintf(stderr, gettext("too many fields "
 			    "for -o option\n"));
 			return (-1);
 		}
 		if ((fields[cfield++] = us_field_index(ofield)) == -1) {
 			(void) fprintf(stderr, gettext("invalid type '%s' "
 			    "for -o option\n"), ofield);
 			return (-1);
 		}
 		if (delim != NULL)
 			ofield = delim + 1;
 	} while (delim != NULL);
 	fields[cfield] = USFIELD_LAST;
 
 	/* Override output types (-t option) */
 	if (tfield != NULL) {
 		types = 0;
 
 		do {
 			boolean_t found = B_FALSE;
 
 			if ((delim = strchr(tfield, ',')) != NULL)
 				*delim = '\0';
 			for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
 			    i++) {
 				if (strcmp(tfield, us_type_names[i]) == 0) {
 					found = B_TRUE;
 					types |= us_type_bits[i];
 					break;
 				}
 			}
 			if (!found) {
 				(void) fprintf(stderr, gettext("invalid type "
 				    "'%s' for -t option\n"), tfield);
 				return (-1);
 			}
 			if (delim != NULL)
 				tfield = delim + 1;
 		} while (delim != NULL);
 	}
 
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
 		return (1);
 
 	if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
 	    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
 		nomem();
 	if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
 		nomem();
 
 	/* Always add default sorting columns */
 	(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
 	(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
 
 	cb.cb_sortcol = sortcol;
 	cb.cb_numname = prtnum;
 	cb.cb_nicenum = !parsable;
 	cb.cb_avl_pool = avl_pool;
 	cb.cb_avl = avl_tree;
 	cb.cb_sid2posix = sid2posix;
 
 	for (i = 0; i < USFIELD_LAST; i++)
 		cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
 
 	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
 		/* Skip properties whose entity kind was not requested. */
 		if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
 		    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
 		    ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
 		    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
 			continue;
 		cb.cb_prop = p;
 		if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) {
 			zfs_close(zhp);
 			return (ret);
 		}
 	}
 
 	/* The dataset handle is not needed beyond this point. */
 	zfs_close(zhp);
 
 	/* Sort the list */
 	if ((node = uu_avl_first(avl_tree)) == NULL)
 		return (0);
 
 	/* From here on us_compare() also honours used/quota columns. */
 	us_populated = B_TRUE;
 
 	listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
 	    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
 	list = uu_list_create(listpool, NULL, UU_DEFAULT);
 	uu_list_node_init(node, &node->usn_listnode, listpool);
 
 	/* Move every node out of the tree into a temporary list ... */
 	while (node != NULL) {
 		rmnode = node;
 		node = uu_avl_next(avl_tree, node);
 		uu_avl_remove(avl_tree, rmnode);
 		if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
 			uu_list_insert(list, rmnode, idx2);
 	}
 
 	/* ... and re-insert them so the final sort order takes effect. */
 	for (node = uu_list_first(list); node != NULL;
 	    node = uu_list_next(list, node)) {
 		us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
 
 		if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
 			uu_avl_insert(avl_tree, node, idx);
 	}
 
 	uu_list_destroy(list);
 	uu_list_pool_destroy(listpool);
 
 	/* Print and free node nvlist memory */
 	print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
 	    cb.cb_avl);
 
 	zfs_free_sort_columns(sortcol);
 
 	/* Clean up the AVL tree */
 	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
 		nomem();
 
 	while ((node = uu_avl_walk_next(walk)) != NULL) {
 		uu_avl_remove(cb.cb_avl, node);
 		free(node);
 	}
 
 	uu_avl_walk_end(walk);
 	uu_avl_destroy(avl_tree);
 	uu_avl_pool_destroy(avl_pool);
 
 	return (ret);
 }
 
 /*
  * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ...
  *      [-t type[,...]] [filesystem|volume|snapshot] ...
  *
  *	-H	Scripted mode; elide headers and separate columns by tabs.
  *	-p	Display values in parsable (literal) format.
  *	-r	Recurse over all children.
  *	-d	Limit recursion by depth.
  *	-o	Control which fields to display.
  *	-s	Specify sort columns, ascending order.
  *	-S	Specify sort columns, descending order.
  *	-t	Control which object types to display.
  *
  * When given no arguments, list all filesystems in the system.
  * Otherwise, list the specified datasets, optionally recursing down them if
  * '-r' is specified.
  */
 /* State shared between zfs_do_list() and list_callback(). */
 typedef struct list_cbdata {
 	boolean_t	cb_first;	/* nothing printed yet; header is due */
 	boolean_t	cb_literal;	/* -p: passed on to the property getters */
 	boolean_t	cb_scripted;	/* -H: no header, tab-separated columns */
 	zprop_list_t	*cb_proplist;	/* columns to print, in order */
 } list_cbdata_t;
 
 /*
  * Given a list of columns to display, output appropriate headers for each one.
  *
  * Native properties use their column name and alignment.  User properties
  * are shown as the upper-cased property name, left-justified.  The last
  * column is printed without padding when it is left-justified.
  */
 static void
 print_header(list_cbdata_t *cb)
 {
 	zprop_list_t *pl = cb->cb_proplist;
 	char headerbuf[ZFS_MAXPROPLEN];
 	const char *header;
 	size_t i;
 	boolean_t first = B_TRUE;
 	boolean_t right_justify;
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		if (!first) {
 			(void) printf("  ");
 		} else {
 			first = B_FALSE;
 		}
 
 		right_justify = B_FALSE;
 		if (pl->pl_prop != ZPROP_INVAL) {
 			header = zfs_prop_column_name(pl->pl_prop);
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else {
 			/*
 			 * Stay inside headerbuf, and give toupper() a value
 			 * representable as unsigned char as <ctype.h>
 			 * requires.
 			 */
 			for (i = 0; pl->pl_user_prop[i] != '\0' &&
 			    i < sizeof (headerbuf) - 1; i++) {
 				headerbuf[i] = toupper(
 				    (unsigned char)pl->pl_user_prop[i]);
 			}
 			headerbuf[i] = '\0';
 			header = headerbuf;
 		}
 
 		/* The '*' width argument must be an int. */
 		if (pl->pl_next == NULL && !right_justify)
 			(void) printf("%s", header);
 		else if (right_justify)
 			(void) printf("%*s", (int)pl->pl_width, header);
 		else
 			(void) printf("%-*s", (int)pl->pl_width, header);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Given a dataset and a list of fields, print out all the properties according
  * to the described layout.
  *
  * Each column is one of: the dataset name, a native property, a userquota
  * property, a written property, or a user property looked up in the
  * dataset's user property list.  A value that cannot be retrieved is shown
  * as "-".  Columns are separated by a tab in scripted mode and by two spaces
  * otherwise.
  */
 static void
 print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
 {
 	zprop_list_t *pl = cb->cb_proplist;
 	boolean_t first = B_TRUE;
 	char property[ZFS_MAXPROPLEN];
 	nvlist_t *userprops = zfs_get_user_props(zhp);
 	nvlist_t *propval;
 	char *propstr;
 	boolean_t right_justify;
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		if (!first) {
 			if (cb->cb_scripted)
 				(void) printf("\t");
 			else
 				(void) printf("  ");
 		} else {
 			first = B_FALSE;
 		}
 
 		if (pl->pl_prop == ZFS_PROP_NAME) {
 			(void) strlcpy(property, zfs_get_name(zhp),
 			    sizeof (property));
 			propstr = property;
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else if (pl->pl_prop != ZPROP_INVAL) {
 			if (zfs_prop_get(zhp, pl->pl_prop, property,
 			    sizeof (property), NULL, NULL, 0,
 			    cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
 			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
 			    property, sizeof (property), cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
 			right_justify = B_TRUE;
 		} else if (zfs_prop_written(pl->pl_user_prop)) {
 			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
 			    property, sizeof (property), cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
 			right_justify = B_TRUE;
 		} else {
 			if (nvlist_lookup_nvlist(userprops,
 			    pl->pl_user_prop, &propval) != 0)
 				propstr = "-";
 			else
 				verify(nvlist_lookup_string(propval,
 				    ZPROP_VALUE, &propstr) == 0);
 			right_justify = B_FALSE;
 		}
 
 		/*
 		 * If this is being called in scripted mode, or if this is the
 		 * last column and it is left-justified, don't include a width
 		 * format specifier.  Where a width is used, the '*' argument
 		 * must be an int, hence the explicit cast.
 		 */
 		if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
 			(void) printf("%s", propstr);
 		else if (right_justify)
 			(void) printf("%*s", (int)pl->pl_width, propstr);
 		else
 			(void) printf("%-*s", (int)pl->pl_width, propstr);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * zfs_for_each() callback for 'zfs list': print one dataset or snapshot.
  * On the very first call the column header is emitted as well, unless
  * scripted mode is active.  Always returns 0.
  */
 static int
 list_callback(zfs_handle_t *zhp, void *data)
 {
 	list_cbdata_t *state = data;
 
 	if (state->cb_first && !state->cb_scripted)
 		print_header(state);
 	if (state->cb_first)
 		state->cb_first = B_FALSE;
 
 	print_dataset(zhp, state);
 
 	return (0);
 }
 
 /*
  * Implementation of 'zfs list'.  Parses the options, builds the column list
  * from -o (or the default fields) and prints every selected dataset through
  * list_callback().  Returns the result of zfs_for_each().
  */
 static int
 zfs_do_list(int argc, char **argv)
 {
 	static char default_fields[] =
 	    "name,used,available,referenced,mountpoint";
 	/* Keywords accepted by -t; the index is what getsubopt() returns. */
 	static char *type_subopts[] = { "filesystem",
 	    "volume", "snapshot", "snap", "bookmark",
 	    "all", NULL };
 	list_cbdata_t cb = { 0 };
 	zfs_sort_column_t *sortcol = NULL;
 	boolean_t types_specified = B_FALSE;
 	char *fields = NULL;
 	char *subval;
 	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
 	int types = ZFS_TYPE_DATASET;
 	int depth = 0;
 	int ret = 0;
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
 		switch (c) {
 		case 'H':
 			cb.cb_scripted = B_TRUE;
 			break;
 		case 'd':
 			depth = parse_depth(optarg, &flags);
 			break;
 		case 'o':
 			fields = optarg;
 			break;
 		case 'p':
 			cb.cb_literal = B_TRUE;
 			flags |= ZFS_ITER_LITERAL_PROPS;
 			break;
 		case 'r':
 			flags |= ZFS_ITER_RECURSE;
 			break;
 		case 's':
 		case 'S':
 			/* -S requests the reversed order of -s. */
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    c == 'S' ? B_TRUE : B_FALSE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid property '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 't':
 			types = 0;
 			types_specified = B_TRUE;
 			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
 			while (*optarg != '\0') {
 				switch (getsubopt(&optarg, type_subopts,
 				    &subval)) {
 				case 0:
 					types |= ZFS_TYPE_FILESYSTEM;
 					break;
 				case 1:
 					types |= ZFS_TYPE_VOLUME;
 					break;
 				case 2:
 				case 3:
 					types |= ZFS_TYPE_SNAPSHOT;
 					break;
 				case 4:
 					types |= ZFS_TYPE_BOOKMARK;
 					break;
 				case 5:
 					types = ZFS_TYPE_DATASET |
 					    ZFS_TYPE_BOOKMARK;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid type '%s'\n"),
 					    suboptarg);
 					usage(B_FALSE);
 				}
 			}
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argv += optind;
 	argc -= optind;
 
 	if (fields == NULL)
 		fields = default_fields;
 
 	/*
 	 * Listing nothing but names, sorted by name only, allows the faster
 	 * simple iteration to be used.
 	 */
 	if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
 		flags |= ZFS_ITER_SIMPLE;
 
 	/*
 	 * "-o space" without an explicit -t leaves snapshots out.
 	 */
 	if (!types_specified && strcmp(fields, "space") == 0)
 		types &= ~ZFS_TYPE_SNAPSHOT;
 
 	/*
 	 * If the user specifies '-o all', the zprop_get_list() doesn't
 	 * normally include the name of the dataset.  For 'zfs list', we always
 	 * want this property to be first.
 	 */
 	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
 	    != 0)
 		usage(B_FALSE);
 
 	cb.cb_first = B_TRUE;
 
 	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
 	    depth, list_callback, &cb);
 
 	zprop_free_list(cb.cb_proplist);
 	zfs_free_sort_columns(sortcol);
 
 	/* cb_first still set means list_callback() was never invoked. */
 	if (ret == 0 && cb.cb_first && !cb.cb_scripted)
 		(void) printf(gettext("no datasets available\n"));
 
 	return (ret);
 }
 
 /*
  * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
  * zfs rename [-f] -p <fs | vol> <fs | vol>
  * zfs rename -r <snap> <snap>
  * zfs rename -u [-p] <fs> <fs>
  *
  * Renames the given dataset to another of the same type.
  *
  * The '-p' flag creates all the non-existing ancestors of the target first.
  *
  * Returns 0 on success and 1 when the source cannot be opened, the ancestors
  * cannot be created, or the rename itself fails.
  */
 /* ARGSUSED */
 static int
 zfs_do_rename(int argc, char **argv)
 {
 	renameflags_t flags = { 0 };
 	boolean_t parents = B_FALSE;
 	zfs_handle_t *zhp;
 	char *snapname = NULL;
 	char *source;
 	char *target;
 	int types;
 	int ret = 0;
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "fpru")) != -1) {
 		switch (c) {
 		case 'f':
 			flags.forceunmount = B_TRUE;
 			break;
 		case 'p':
 			parents = B_TRUE;
 			break;
 		case 'r':
 			flags.recurse = B_TRUE;
 			break;
 		case 'u':
 			flags.nounmount = B_TRUE;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argv += optind;
 	argc -= optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	source = argv[0];
 	target = argv[1];
 
 	/* check option combinations */
 	if (flags.recurse && parents) {
 		(void) fprintf(stderr, gettext("-p and -r options are mutually "
 		    "exclusive\n"));
 		usage(B_FALSE);
 	}
 
 	if (flags.recurse && strchr(source, '@') == NULL) {
 		(void) fprintf(stderr, gettext("source dataset for recursive "
 		    "rename must be a snapshot\n"));
 		usage(B_FALSE);
 	}
 
 	if (flags.nounmount && parents) {
 		(void) fprintf(stderr, gettext("-u and -p options are mutually "
 		    "exclusive\n"));
 		usage(B_FALSE);
 	}
 
 	/* Restrict the dataset types the source may be opened as. */
 	if (flags.nounmount)
 		types = ZFS_TYPE_FILESYSTEM;
 	else if (parents)
 		types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
 	else
 		types = ZFS_TYPE_DATASET;
 
 	if (flags.recurse) {
 		/*
 		 * For a recursive rename the snapshot need not exist on the
 		 * named dataset itself, since it may exist further down.
 		 * Split "dataset@snap" so that the dataset is opened and the
 		 * snapshot name is handed to zfs_rename() separately.
 		 */
 		snapname = strchr(source, '@');
 		assert(snapname != NULL);
 		*snapname++ = '\0';
 	}
 
 	zhp = zfs_open(g_zfs, source, types);
 	if (zhp == NULL)
 		return (1);
 
 	/* If we were asked and the name looks good, try to create ancestors. */
 	if (parents && zfs_name_valid(target, zfs_get_type(zhp)) &&
 	    zfs_create_ancestors(g_zfs, target) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 	ret = (zfs_rename(zhp, snapname, target, flags) != 0);
 
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * zfs promote <fs>
  *
  * Promotes the given clone fs to be the parent
  *
  * Takes exactly one operand and no options.  Returns 0 on success and 1 when
  * the dataset cannot be opened or the promotion fails.
  */
 /* ARGSUSED */
 static int
 zfs_do_promote(int argc, char **argv)
 {
 	zfs_handle_t *clone;
 	int failed;
 
 	/* check options */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* check number of arguments */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing clone filesystem"
 		    " argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	clone = zfs_open(g_zfs, argv[1],
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (clone == NULL)
 		return (1);
 
 	failed = (zfs_promote(clone) != 0);
 	zfs_close(clone);
 
 	return (failed);
 }
 
 /*
  * zfs rollback [-rRf] <snapshot>
  *
  *	-r	Delete any intervening snapshots before doing rollback
  *	-R	Delete any snapshots and their clones
  *	-f	ignored for backwards compatibility
  *
  * Given a filesystem, rollback to a specific snapshot, discarding any changes
  * since then and making it the active dataset.  If more recent snapshots exist,
  * the command will complain unless the '-r' flag is given.
  */
 /*
  * State shared between zfs_do_rollback() and the rollback_check() /
  * rollback_check_dependent() iteration callbacks.
  */
 typedef struct rollback_cbdata {
 	/* creation txg of the target snapshot; newer entries are reported */
 	uint64_t	cb_create;
 	/* true until the explanatory header has been printed once */
 	boolean_t	cb_first;
 	/* set by '-R'; rollback_check() then returns without checking */
 	int		cb_doclones;
 	/* snapshot name as given on the command line, used in messages */
 	char		*cb_target;
 	/* set to 1 by the callbacks when a header message is printed */
 	int		cb_error;
 	/* set by '-r' and '-R' */
 	boolean_t	cb_recurse;
 } rollback_cbdata_t;
 
 /*
  * Callback for zfs_iter_dependents(): print the name of one dependent of a
  * snapshot that is more recent than the rollback target.  The first time it
  * runs in recursive mode it also prints the explanatory header and flags the
  * error in the callback data.  Always closes 'zhp' and returns 0.
  */
 static int
 rollback_check_dependent(zfs_handle_t *zhp, void *data)
 {
 	rollback_cbdata_t *cb = data;
 
 	if (cb->cb_first && cb->cb_recurse) {
 		/* one-time header; later calls only list names */
 		(void) fprintf(stderr, gettext("cannot rollback to "
 		    "'%s': clones of previous snapshots exist\n"),
 		    cb->cb_target);
 		(void) fprintf(stderr, gettext("use '-R' to "
 		    "force deletion of the following clones and "
 		    "dependents:\n"));
 		cb->cb_error = 1;
 		cb->cb_first = 0;
 	}
 
 	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	zfs_close(zhp);
 
 	return (0);
 }
 
 /*
  * Iteration callback used by zfs_do_rollback() for both snapshots and
  * bookmarks.  Entries whose creation txg is newer than the rollback target
  * are handled as follows:
  *
  *   - without '-r' (cb_recurse clear) the entry name is printed, preceded
  *     once by a header, and cb_error is set;
  *   - with '-r' the entry's dependents are walked with
  *     rollback_check_dependent().
  *
  * With '-R' (cb_doclones set) nothing is checked.  The handle is always
  * closed.  Returns 0, or -1 if walking the dependents failed.
  */
 static int
 rollback_check(zfs_handle_t *zhp, void *data)
 {
 	rollback_cbdata_t *cb = data;
 	int rv = 0;
 
 	/* nothing to report for '-R' or for entries that are not newer */
 	if (cb->cb_doclones ||
 	    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= cb->cb_create)
 		goto done;
 
 	if (cb->cb_first && !cb->cb_recurse) {
 		(void) fprintf(stderr, gettext("cannot "
 		    "rollback to '%s': more recent snapshots "
 		    "or bookmarks exist\n"),
 		    cb->cb_target);
 		(void) fprintf(stderr, gettext("use '-r' to "
 		    "force deletion of the following "
 		    "snapshots and bookmarks:\n"));
 		cb->cb_first = 0;
 		cb->cb_error = 1;
 	}
 
 	if (!cb->cb_recurse) {
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	} else if (zfs_iter_dependents(zhp, B_TRUE,
 	    rollback_check_dependent, cb) != 0) {
 		rv = -1;
 	}
 
 done:
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Implementation of 'zfs rollback [-rRf] <snapshot>'.
  *
  * Opens the named snapshot and its parent dataset, runs rollback_check()
  * over the parent's snapshots and bookmarks, and, if nothing was flagged,
  * calls zfs_rollback().  Returns 0 on success and 1 on any failure.
  */
 static int
 zfs_do_rollback(int argc, char **argv)
 {
 	rollback_cbdata_t cb = { 0 };
 	boolean_t force = B_FALSE;
 	char parentname[ZFS_MAXNAMELEN];
 	zfs_handle_t *zhp, *snap;
 	char *at;
 	int opt;
 	int ret;
 
 	/* check options */
 	while ((opt = getopt(argc, argv, "rRf")) != -1) {
 		switch (opt) {
 		case 'r':
 			cb.cb_recurse = 1;
 			break;
 		case 'R':
 			cb.cb_recurse = 1;
 			cb.cb_doclones = 1;
 			break;
 		case 'f':
 			force = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argv += optind;
 	argc -= optind;
 
 	/* exactly one operand: the snapshot to roll back to */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* open the snapshot */
 	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/* the parent dataset name is everything before the last '@' */
 	(void) strlcpy(parentname, argv[0], sizeof (parentname));
 	at = strrchr(parentname, '@');
 	verify(at != NULL);
 	*at = '\0';
 
 	zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET);
 	if (zhp == NULL) {
 		zfs_close(snap);
 		return (1);
 	}
 
 	/*
 	 * Look for more recent snapshots, bookmarks and/or clones; what gets
 	 * reported depends on '-r' and '-R'.  Each step runs only when all
 	 * previous ones succeeded.
 	 */
 	cb.cb_target = argv[0];
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	cb.cb_first = B_TRUE;
 	cb.cb_error = 0;
 
 	ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb);
 	if (ret == 0)
 		ret = zfs_iter_bookmarks(zhp, rollback_check, &cb);
 	if (ret == 0)
 		ret = cb.cb_error;
 
 	/* roll the parent back to the given snapshot */
 	if (ret == 0)
 		ret = zfs_rollback(zhp, snap, force);
 
 	zfs_close(snap);
 	zfs_close(zhp);
 
 	return (ret != 0);
 }
 
 /*
  * zfs set property=value ... { fs | snap | vol } ...
  *
  * Sets the given properties for all datasets specified on the command line.
  */
 
 /*
  * Per-dataset callback for zfs_do_set(): applies the property list passed in
  * 'data' to 'zhp'.  On failure an extra hint is printed for the mount and
  * NFS-share error codes.  Returns 0 on success and 1 on failure.
  */
 static int
 set_callback(zfs_handle_t *zhp, void *data)
 {
 	nvlist_t *props = data;
 	int err;
 
 	if (zfs_prop_set_list(zhp, props) == 0)
 		return (0);
 
 	err = libzfs_errno(g_zfs);
 	if (err == EZFS_MOUNTFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to remount filesystem\n"));
 	} else if (err == EZFS_SHARENFSFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to reshare filesystem\n"));
 	}
 
 	return (1);
 }
 
 /*
  * Implementation of 'zfs set'.
  *
  * argv[1..] must be one or more "prop=value" arguments followed by one or
  * more dataset names; any argument containing '=' is treated as a property
  * setting.  The settings are collected into one nvlist and applied to every
  * dataset through set_callback().  Returns the result of parseprop() or
  * zfs_for_each(), whichever fails first, or 0 on success.
  */
 static int
 zfs_do_set(int argc, char **argv)
 {
 	nvlist_t *props = NULL;
 	int ds_start = -1; /* argv idx of first dataset arg */
 	int ret = 0;
 
 	/* check for options */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* check number of arguments */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing arguments\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 3) {
 		/* one argument only: say which half is missing */
 		if (strchr(argv[1], '=') == NULL) {
 			(void) fprintf(stderr, gettext("missing property=value "
 			    "argument(s)\n"));
 		} else {
 			(void) fprintf(stderr, gettext("missing dataset "
 			    "name(s)\n"));
 		}
 		usage(B_FALSE);
 	}
 
 	/* validate argument order:  prop=val args followed by dataset args */
 	for (int i = 1; i < argc; i++) {
 		if (strchr(argv[i], '=') != NULL) {
 			if (ds_start > 0) {
 				/*
 				 * out-of-order prop=val argument; the format
 				 * string has no conversions, so no extra
 				 * arguments are passed.
 				 */
 				(void) fprintf(stderr, gettext("invalid "
 				    "argument order\n"));
 				usage(B_FALSE);
 			}
 		} else if (ds_start < 0) {
 			ds_start = i;
 		}
 	}
 	if (ds_start < 0) {
 		(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
 		usage(B_FALSE);
 	}
 
 	/* Populate a list of property settings */
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 	for (int i = 1; i < ds_start; i++) {
 		if ((ret = parseprop(props, argv[i])) != 0)
 			goto error;
 	}
 
 	ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
 	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
 
 error:
 	nvlist_free(props);
 	return (ret);
 }
 
 /*
  * State passed to zfs_snapshot_cb() while collecting the snapshots to create.
  */
 typedef struct snap_cbdata {
 	/* one boolean entry named "<dataset>@<snapname>" per snapshot */
 	nvlist_t *sd_nvl;
 	/* set by '-r': also descend into child filesystems */
 	boolean_t sd_recursive;
 	/* the part of the command-line argument after the '@' */
 	const char *sd_snapname;
 } snap_cbdata_t;
 
 /*
  * Record "<dataset>@<snapname>" for 'zhp' in the snapshot list and, in
  * recursive mode, do the same for its child filesystems.  In recursive mode
  * datasets whose ZFS_PROP_INCONSISTENT property is non-zero are skipped.
  * Always closes 'zhp'.  Returns the result of the child iteration, or 0.
  */
 static int
 zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
 {
 	snap_cbdata_t *sd = arg;
 	char *snapname;
 	int rv;
 
 	if (sd->sd_recursive &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	if (asprintf(&snapname, "%s@%s", zfs_get_name(zhp),
 	    sd->sd_snapname) == -1)
 		nomem();
 	fnvlist_add_boolean(sd->sd_nvl, snapname);
 	free(snapname);
 
 	rv = sd->sd_recursive ?
 	    zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd) : 0;
 
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
  *
  * Creates a snapshot with the given name.  While functionally equivalent to
  * 'zfs create', it is a separate command to differentiate intent.
  *
  * Every operand must contain '@'.  The names of all snapshots to create
  * (including descendants with '-r') are gathered by zfs_snapshot_cb() and
  * created with a single zfs_snapshot_nvl() call.  Returns 0 on success and
  * 1 on failure.  Both nvlists are released on every return path.
  */
 static int
 zfs_do_snapshot(int argc, char **argv)
 {
 	int ret = 0;
 	int c;
 	nvlist_t *props;
 	snap_cbdata_t sd = { 0 };
 	boolean_t multiple_snaps = B_FALSE;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 	if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, "ro:")) != -1) {
 		switch (c) {
 		case 'o':
 			if (parseprop(props, optarg) != 0) {
 				/* don't leak the lists allocated above */
 				nvlist_free(sd.sd_nvl);
 				nvlist_free(props);
 				return (1);
 			}
 			break;
 		case 'r':
 			sd.sd_recursive = B_TRUE;
 			multiple_snaps = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			goto usage;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		goto usage;
 	}
 
 	if (argc > 1)
 		multiple_snaps = B_TRUE;
 	for (; argc > 0; argc--, argv++) {
 		char *atp;
 		zfs_handle_t *zhp;
 
 		/* split "fs@snap" in place into dataset and snapshot name */
 		atp = strchr(argv[0], '@');
 		if (atp == NULL)
 			goto usage;
 		*atp = '\0';
 		sd.sd_snapname = atp + 1;
 		zhp = zfs_open(g_zfs, argv[0],
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (zhp == NULL)
 			goto usage;
 		/* the callback closes zhp */
 		if (zfs_snapshot_cb(zhp, &sd) != 0)
 			goto usage;
 	}
 
 	ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
 	nvlist_free(sd.sd_nvl);
 	nvlist_free(props);
 	if (ret != 0 && multiple_snaps)
 		(void) fprintf(stderr, gettext("no snapshots were created\n"));
 	return (ret != 0);
 
 usage:
 	nvlist_free(sd.sd_nvl);
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (-1);
 }
 
 /*
  * Send a backup stream to stdout.
  *
  * Options handled here: -i/-I <from> (incremental source; -I also sets
  * 'doall'), -R, -p, -P, -v (twice for extra verbosity), -D, -n, -L, -e and
  * -t <token> (resume).  With -t no operand is allowed; otherwise exactly one
  * operand is required.  Unless -n is given, stdout must not be a terminal.
  *
  * A target without '@', or an incremental source containing '#', is sent
  * with zfs_send_one(); everything else goes through zfs_send().
  * Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_send(int argc, char **argv)
 {
 	char *fromname = NULL;
 	char *toname = NULL;
 	char *resume_token = NULL;
 	char *cp;
 	zfs_handle_t *zhp;
 	sendflags_t flags = { 0 };
 	int c, err;
 	nvlist_t *dbgnv = NULL;
 	boolean_t extraverbose = B_FALSE;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
 		switch (c) {
 		case 'i':
 			if (fromname)
 				usage(B_FALSE);
 			fromname = optarg;
 			break;
 		case 'I':
 			if (fromname)
 				usage(B_FALSE);
 			fromname = optarg;
 			flags.doall = B_TRUE;
 			break;
 		case 'R':
 			flags.replicate = B_TRUE;
 			break;
 		case 'p':
 			flags.props = B_TRUE;
 			break;
 		case 'P':
 			flags.parsable = B_TRUE;
 			flags.verbose = B_TRUE;
 			break;
 		case 'v':
 			/* a second -v turns on extra verbosity */
 			if (flags.verbose)
 				extraverbose = B_TRUE;
 			flags.verbose = B_TRUE;
 			flags.progress = B_TRUE;
 			break;
 		case 'D':
 			flags.dedup = B_TRUE;
 			break;
 		case 'n':
 			flags.dryrun = B_TRUE;
 			break;
 		case 'L':
 			flags.largeblock = B_TRUE;
 			break;
 		case 'e':
 			flags.embed_data = B_TRUE;
 			break;
 		case 't':
 			resume_token = optarg;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (resume_token != NULL) {
 		if (fromname != NULL || flags.replicate || flags.props ||
 		    flags.dedup) {
 			(void) fprintf(stderr,
 			    gettext("invalid flags combined with -t\n"));
 			usage(B_FALSE);
 		}
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("no additional "
 			    "arguments are permitted with -t\n"));
 			usage(B_FALSE);
 		}
 	} else {
 		if (argc < 1) {
 			(void) fprintf(stderr,
 			    gettext("missing snapshot argument\n"));
 			usage(B_FALSE);
 		}
 		if (argc > 1) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 	}
 
 	if (!flags.dryrun && isatty(STDOUT_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Stream can not be written to a terminal.\n"
 		    "You must redirect standard output.\n"));
 		return (1);
 	}
 
 	if (resume_token != NULL) {
 		return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
 		    resume_token));
 	}
 
 	/*
 	 * Special case sending a filesystem, or from a bookmark.
 	 */
 	if (strchr(argv[0], '@') == NULL ||
 	    (fromname && strchr(fromname, '#') != NULL)) {
 		char frombuf[ZFS_MAXNAMELEN];
 		enum lzc_send_flags lzc_flags = 0;
 
 		if (flags.replicate || flags.doall || flags.props ||
 		    flags.dedup || flags.dryrun || flags.verbose ||
 		    flags.progress) {
 			(void) fprintf(stderr,
 			    gettext("Error: "
 			    "Unsupported flag with filesystem or bookmark.\n"));
 			return (1);
 		}
 
 		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
 		if (zhp == NULL)
 			return (1);
 
 		if (flags.largeblock)
 			lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (flags.embed_data)
 			lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 
 		if (fromname != NULL &&
 		    (fromname[0] == '#' || fromname[0] == '@')) {
 			/*
 			 * Incremental source name begins with # or @.
 			 * Default to same fs as target.
 			 *
 			 * strlcpy (not strncpy) so that frombuf is always
 			 * NUL-terminated before strchr/strlcat read it.
 			 */
 			(void) strlcpy(frombuf, argv[0], sizeof (frombuf));
 			cp = strchr(frombuf, '@');
 			if (cp != NULL)
 				*cp = '\0';
 			(void) strlcat(frombuf, fromname, sizeof (frombuf));
 			fromname = frombuf;
 		}
 		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
 		zfs_close(zhp);
 		return (err != 0);
 	}
 
 	/* the branch above returned, so argv[0] is known to contain '@' */
 	cp = strchr(argv[0], '@');
 	*cp = '\0';
 	toname = cp + 1;
 	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (zhp == NULL)
 		return (1);
 
 	/*
 	 * If they specified the full path to the snapshot, chop off
 	 * everything except the short name of the snapshot, but special
 	 * case if they specify the origin.
 	 */
 	if (fromname && (cp = strchr(fromname, '@')) != NULL) {
 		char origin[ZFS_MAXNAMELEN];
 		zprop_source_t src;
 
 		(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
 		    origin, sizeof (origin), &src, NULL, 0, B_FALSE);
 
 		if (strcmp(origin, fromname) == 0) {
 			fromname = NULL;
 			flags.fromorigin = B_TRUE;
 		} else {
 			*cp = '\0';
 			if (cp != fromname && strcmp(argv[0], fromname)) {
 				(void) fprintf(stderr,
 				    gettext("incremental source must be "
 				    "in same filesystem\n"));
 				usage(B_FALSE);
 			}
 			fromname = cp + 1;
 			if (strchr(fromname, '@') || strchr(fromname, '/')) {
 				(void) fprintf(stderr,
 				    gettext("invalid incremental source\n"));
 				usage(B_FALSE);
 			}
 		}
 	}
 
 	if (flags.replicate && fromname == NULL)
 		flags.doall = B_TRUE;
 
 	err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
 	    extraverbose ? &dbgnv : NULL);
 
 	if (extraverbose && dbgnv != NULL) {
 		/*
 		 * dump_nvlist prints to stdout, but that's been
 		 * redirected to a file.  Make it print to stderr
 		 * instead.
 		 */
 		(void) dup2(STDERR_FILENO, STDOUT_FILENO);
 		dump_nvlist(dbgnv, 0);
 		nvlist_free(dbgnv);
 	}
 	zfs_close(zhp);
 
 	return (err != 0);
 }
 
 /*
  * Restore a backup stream from stdin.
  *
  * Options handled here: -o origin=<snap> (the only property accepted),
  * -d, -e, -n, -u, -v, -s, -F and -A.  Exactly one operand is required.
  *
  * With -A no stream is read; instead the partially received state of the
  * named dataset is destroyed: either its "<name>/%recv" child if that
  * exists, or the dataset itself when it is marked inconsistent and has a
  * receive resume token.
  *
  * Returns 0 on success and 1 on failure.  The property list and any opened
  * dataset handle are released on every return path.
  */
 static int
 zfs_do_receive(int argc, char **argv)
 {
 	int c, err;
 	recvflags_t flags = { 0 };
 	boolean_t abort_resumable = B_FALSE;
 
 	nvlist_t *props;
 	nvpair_t *nvp = NULL;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
 		switch (c) {
 		case 'o':
 			if (parseprop(props, optarg) != 0) {
 				nvlist_free(props);
 				return (1);
 			}
 			break;
 		case 'd':
 			flags.isprefix = B_TRUE;
 			break;
 		case 'e':
 			flags.isprefix = B_TRUE;
 			flags.istail = B_TRUE;
 			break;
 		case 'n':
 			flags.dryrun = B_TRUE;
 			break;
 		case 'u':
 			flags.nomount = B_TRUE;
 			break;
 		case 'v':
 			flags.verbose = B_TRUE;
 			break;
 		case 's':
 			flags.resumable = B_TRUE;
 			break;
 		case 'F':
 			flags.force = B_TRUE;
 			break;
 		case 'A':
 			abort_resumable = B_TRUE;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* "origin" is the only property that may be given with -o */
 	while ((nvp = nvlist_next_nvpair(props, nvp))) {
 		if (strcmp(nvpair_name(nvp), "origin") != 0) {
 			(void) fprintf(stderr, gettext("invalid option\n"));
 			usage(B_FALSE);
 		}
 	}
 
 	if (abort_resumable) {
 		if (flags.isprefix || flags.istail || flags.dryrun ||
 		    flags.resumable || flags.nomount) {
 			(void) fprintf(stderr, gettext("invalid option\n"));
 			usage(B_FALSE);
 		}
 
 		char namebuf[ZFS_MAXNAMELEN];
 		(void) snprintf(namebuf, sizeof (namebuf),
 		    "%s/%%recv", argv[0]);
 
 		if (zfs_dataset_exists(g_zfs, namebuf,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
 			zfs_handle_t *zhp = zfs_open(g_zfs,
 			    namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 			if (zhp == NULL) {
 				nvlist_free(props);
 				return (1);
 			}
 			err = zfs_destroy(zhp, B_FALSE);
 			zfs_close(zhp);
 		} else {
 			zfs_handle_t *zhp = zfs_open(g_zfs,
 			    argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 			if (zhp == NULL)
 				usage(B_FALSE);
 			if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
 			    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
 			    NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
 				(void) fprintf(stderr,
 				    gettext("'%s' does not have any "
 				    "resumable receive state to abort\n"),
 				    argv[0]);
 				zfs_close(zhp);
 				nvlist_free(props);
 				return (1);
 			}
 			err = zfs_destroy(zhp, B_FALSE);
 			zfs_close(zhp);
 		}
 
 		nvlist_free(props);
 		return (err != 0);
 	}
 
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
 		    "from a terminal.\n"
 		    "You must redirect standard input.\n"));
 		nvlist_free(props);
 		return (1);
 	}
 	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
 	nvlist_free(props);
 
 	return (err != 0);
 }
 
 /*
  * allow/unallow stuff
  *
  * The strings below are the permission names; zfs_deleg_perm_tbl pairs most
  * of them with a ZFS_DELEG_NOTE_* value.
  */
 /* copied from zfs/sys/dsl_deleg.h */
 #define	ZFS_DELEG_PERM_CREATE		"create"
 #define	ZFS_DELEG_PERM_DESTROY		"destroy"
 #define	ZFS_DELEG_PERM_SNAPSHOT		"snapshot"
 #define	ZFS_DELEG_PERM_ROLLBACK		"rollback"
 #define	ZFS_DELEG_PERM_CLONE		"clone"
 #define	ZFS_DELEG_PERM_PROMOTE		"promote"
 #define	ZFS_DELEG_PERM_RENAME		"rename"
 #define	ZFS_DELEG_PERM_MOUNT		"mount"
 #define	ZFS_DELEG_PERM_SHARE		"share"
 #define	ZFS_DELEG_PERM_SEND		"send"
 #define	ZFS_DELEG_PERM_RECEIVE		"receive"
 #define	ZFS_DELEG_PERM_ALLOW		"allow"
 #define	ZFS_DELEG_PERM_USERPROP		"userprop"
 /* defined here but has no entry in zfs_deleg_perm_tbl */
 #define	ZFS_DELEG_PERM_VSCAN		"vscan" /* ??? */
 #define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
 #define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
 #define	ZFS_DELEG_PERM_USERUSED		"userused"
 #define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
 #define	ZFS_DELEG_PERM_HOLD		"hold"
 #define	ZFS_DELEG_PERM_RELEASE		"release"
 #define	ZFS_DELEG_PERM_DIFF		"diff"
 #define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
 
 /*
  * Alias for ZFS_DELEG_NOTE_NONE.  NOTE(review): presumably NONE is the last
  * enumerator and so doubles as the number of notes -- confirm against the
  * zfs_deleg_note_t definition.
  */
 #define	ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
 
 /*
  * Table pairing each delegated permission name with its note value.  The
  * first group holds the entries that deleg_perm_type() labels "subcommand",
  * the second group the ones it labels "other".  The table is terminated by
  * a { NULL, ZFS_DELEG_NOTE_NONE } entry.
  */
 static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
 	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
 	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
 	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
 	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
 	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
 	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
 	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
 	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
 	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
 	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
 	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
 	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
 	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
 	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
 	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
 	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
 
 	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
 	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
 	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
 	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
 	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
 	{ NULL, ZFS_DELEG_NOTE_NONE }
 };
 
 /*
  * permission structure: one named permission held by a "who" entry.
  * dp_who_type and dp_name are filled in by deleg_perm_init(); dp_local and
  * dp_descend are set by set_deleg_perm_node() according to the locality.
  */
 typedef struct deleg_perm {
 	zfs_deleg_who_type_t	dp_who_type;
 	const char		*dp_name;
 	boolean_t		dp_local;
 	boolean_t		dp_descend;
 } deleg_perm_t;
 
 /* AVL container for a deleg_perm_t; ordered by deleg_perm_compare() */
 typedef struct deleg_perm_node {
 	deleg_perm_t		dpn_perm;
 
 	uu_avl_node_t		dpn_avl_node;
 } deleg_perm_node_t;
 
 /* forward declaration: who_perm_t points back at its owning fs_perm_t */
 typedef struct fs_perm fs_perm_t;
 
 /*
  * permissions set: everything granted to one "who" (a named set, the create
  * set, a user, a group or everyone) on one filesystem.
  */
 typedef struct who_perm {
 	zfs_deleg_who_type_t	who_type;
 	const char		*who_name;		/* id */
 	char			who_ug_name[256];	/* user/group name */
 	fs_perm_t		*who_fsperm;		/* uplink */
 
 	uu_avl_t		*who_deleg_perm_avl;	/* permissions */
 } who_perm_t;
 
 /* AVL container for a who_perm_t; ordered by who_perm_compare() */
 typedef struct who_perm_node {
 	who_perm_t	who_perm;
 	uu_avl_node_t	who_avl_node;
 } who_perm_node_t;
 
 /* forward declaration: fs_perm points back at its owning fs_perm_set_t */
 typedef struct fs_perm_set fs_perm_set_t;
 /*
  * fs permissions: all "who" entries of one filesystem, split over two AVL
  * trees by who-type (see parse_fs_perm()).
  */
 struct fs_perm {
 	const char		*fsp_name;
 
 	uu_avl_t		*fsp_sc_avl;	/* sets,create */
 	uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */
 
 	fs_perm_set_t		*fsp_set;	/* uplink */
 };
 
 /*
  * List container for an fs_perm_t; linked into fs_perm_set.fsps_list.
  * NOTE(review): fspn_avl is not referenced by the init/fini/parse helpers
  * in this part of the file -- confirm whether it is still used.
  */
 typedef struct fs_perm_node {
 	fs_perm_t	fspn_fsperm;
 	uu_avl_t	*fspn_avl;
 
 	uu_list_node_t	fspn_list_node;
 } fs_perm_node_t;
 
 /*
  * top level structure: the list of per-filesystem permissions plus the
  * libuutil pools that all lists and AVL trees below it are created from.
  * Set up by fs_perm_set_init().
  */
 struct fs_perm_set {
 	uu_list_pool_t	*fsps_list_pool;
 	uu_list_t	*fsps_list; /* list of fs_perms */
 
 	/* pool for fs_perm.fsp_sc_avl trees */
 	uu_avl_pool_t	*fsps_named_set_avl_pool;
 	/* pool for fs_perm.fsp_uge_avl trees */
 	uu_avl_pool_t	*fsps_who_perm_avl_pool;
 	/* pool for who_perm.who_deleg_perm_avl trees */
 	uu_avl_pool_t	*fsps_deleg_perm_avl_pool;
 };
 
 /*
  * Return the (localized) category label for a permission note: "other" for
  * the group/user quota, group/user used and userprop notes, and
  * "subcommand" for every other note.
  */
 static inline const char *
 deleg_perm_type(zfs_deleg_note_t note)
 {
 	if (note == ZFS_DELEG_NOTE_GROUPQUOTA ||
 	    note == ZFS_DELEG_NOTE_GROUPUSED ||
 	    note == ZFS_DELEG_NOTE_USERPROP ||
 	    note == ZFS_DELEG_NOTE_USERQUOTA ||
 	    note == ZFS_DELEG_NOTE_USERUSED) {
 		/* other */
 		return (gettext("other"));
 	}
 
 	/* subcommands */
 	return (gettext("subcommand"));
 }
 
 /*
  * Map a who-type to its sort weight: named sets 0, create 1, user 2,
  * group 3, everyone 4.  The "_SETS" variant of each type shares the weight
  * of its plain counterpart.  Any other value yields -1.
  */
 static int inline
 who_type2weight(zfs_deleg_who_type_t who_type)
 {
 	switch (who_type) {
 	case ZFS_DELEG_NAMED_SET_SETS:
 	case ZFS_DELEG_NAMED_SET:
 		return (0);
 	case ZFS_DELEG_CREATE_SETS:
 	case ZFS_DELEG_CREATE:
 		return (1);
 	case ZFS_DELEG_USER_SETS:
 	case ZFS_DELEG_USER:
 		return (2);
 	case ZFS_DELEG_GROUP_SETS:
 	case ZFS_DELEG_GROUP:
 		return (3);
 	case ZFS_DELEG_EVERYONE_SETS:
 	case ZFS_DELEG_EVERYONE:
 		return (4);
 	default:
 		return (-1);
 	}
 }
 
 /*
  * AVL comparator for who_perm_node_t: order by who-type weight first and,
  * within equal weights, by the first ZFS_MAX_DELEG_NAME-1 bytes of the
  * name.  Returns -1, 0 or 1.
  */
 /* ARGSUSED */
 static int
 who_perm_compare(const void *larg, const void *rarg, void *unused)
 {
 	const who_perm_node_t *lhs = larg;
 	const who_perm_node_t *rhs = rarg;
 	int diff;
 
 	diff = who_type2weight(lhs->who_perm.who_type) -
 	    who_type2weight(rhs->who_perm.who_type);
 	if (diff == 0) {
 		diff = strncmp(lhs->who_perm.who_name, rhs->who_perm.who_name,
 		    ZFS_MAX_DELEG_NAME-1);
 	}
 
 	/* normalize to exactly -1, 0 or 1 */
 	if (diff > 0)
 		return (1);
 	return (diff < 0 ? -1 : 0);
 }
 
 /*
  * AVL comparator for deleg_perm_node_t: order by the first
  * ZFS_MAX_DELEG_NAME-1 bytes of the permission name.  Returns -1, 0 or 1.
  */
 /* ARGSUSED */
 static int
 deleg_perm_compare(const void *larg, const void *rarg, void *unused)
 {
 	const deleg_perm_node_t *lhs = larg;
 	const deleg_perm_node_t *rhs = rarg;
 	int diff;
 
 	diff = strncmp(lhs->dpn_perm.dp_name, rhs->dpn_perm.dp_name,
 	    ZFS_MAX_DELEG_NAME-1);
 
 	/* normalize to exactly -1, 0 or 1 */
 	if (diff > 0)
 		return (1);
 	return (diff < 0 ? -1 : 0);
 }
 
 /*
  * Zero 'fspset' and create its list pool, its (empty) list and the three
  * AVL pools.  Any creation failure is routed to nomem().
  */
 static inline void
 fs_perm_set_init(fs_perm_set_t *fspset)
 {
 	bzero(fspset, sizeof (fs_perm_set_t));
 
 	/* list of fs_perm_node_t, one per filesystem */
 	fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
 	    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
 	    NULL, UU_DEFAULT);
 	if (fspset->fsps_list_pool == NULL)
 		nomem();
 
 	fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
 	    UU_DEFAULT);
 	if (fspset->fsps_list == NULL)
 		nomem();
 
 	/* both who_perm pools share node layout and comparator */
 	fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
 	    "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
 	    who_perm_node_t, who_avl_node), who_perm_compare,
 	    UU_DEFAULT);
 	if (fspset->fsps_named_set_avl_pool == NULL)
 		nomem();
 
 	fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
 	    "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
 	    who_perm_node_t, who_avl_node), who_perm_compare,
 	    UU_DEFAULT);
 	if (fspset->fsps_who_perm_avl_pool == NULL)
 		nomem();
 
 	fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
 	    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
 	    deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT);
 	if (fspset->fsps_deleg_perm_avl_pool == NULL)
 		nomem();
 }
 
 static inline void fs_perm_fini(fs_perm_t *);
 static inline void who_perm_fini(who_perm_t *);
 
 /*
  * Tear down every fs_perm in the set's list, free the list nodes and
  * destroy the three AVL pools.
  *
  * NOTE(review): fsps_list and fsps_list_pool themselves are not destroyed
  * here -- confirm whether that is intentional.
  */
 static inline void
 fs_perm_set_fini(fs_perm_set_t *fspset)
 {
 	uu_list_t *list = fspset->fsps_list;
 	fs_perm_node_t *cur, *next;
 
 	for (cur = uu_list_first(list); cur != NULL; cur = next) {
 		/* fetch the successor before the node is unlinked */
 		next = uu_list_next(list, cur);
 
 		fs_perm_fini(&cur->fspn_fsperm);
 		uu_list_remove(list, cur);
 		free(cur);
 	}
 
 	uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
 	uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
 	uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
 }
 
 /*
  * Record the who-type and permission name in 'deleg_perm'.  The name is
  * stored by pointer, not copied; the other fields are left untouched.
  */
 static inline void
 deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
     const char *name)
 {
 	deleg_perm->dp_name = name;
 	deleg_perm->dp_who_type = type;
 }
 
 /*
  * Zero 'who_perm', give it an empty permission AVL tree drawn from the
  * owning set's deleg-perm pool, and record its type, name (by pointer) and
  * owning fs_perm.  Failure to create the tree is routed to nomem().
  */
 static inline void
 who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
     zfs_deleg_who_type_t type, const char *name)
 {
 	uu_avl_pool_t *perm_pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
 
 	bzero(who_perm, sizeof (who_perm_t));
 
 	who_perm->who_deleg_perm_avl = uu_avl_create(perm_pool, NULL,
 	    UU_DEFAULT);
 	if (who_perm->who_deleg_perm_avl == NULL)
 		nomem();
 
 	who_perm->who_fsperm = fsperm;
 	who_perm->who_name = name;
 	who_perm->who_type = type;
 }
 
 /*
  * Remove and free every node of the permission AVL tree, then destroy the
  * tree itself.  'who_perm' is not freed.
  */
 static inline void
 who_perm_fini(who_perm_t *who_perm)
 {
 	uu_avl_t *perms = who_perm->who_deleg_perm_avl;
 	deleg_perm_node_t *cur, *next;
 
 	for (cur = uu_avl_first(perms); cur != NULL; cur = next) {
 		/* fetch the successor before the node is unlinked */
 		next = uu_avl_next(perms, cur);
 
 		uu_avl_remove(perms, cur);
 		free(cur);
 	}
 
 	uu_avl_destroy(perms);
 }
 
 /*
  * Zero 'fsperm', create its two empty AVL trees (sets/create and
  * user/group/everyone) from the set's pools, and record the owning set and
  * the filesystem name (by pointer).  Creation failures go to nomem().
  */
 static inline void
 fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
 {
 	bzero(fsperm, sizeof (fs_perm_t));
 
 	fsperm->fsp_sc_avl = uu_avl_create(fspset->fsps_named_set_avl_pool,
 	    NULL, UU_DEFAULT);
 	if (fsperm->fsp_sc_avl == NULL)
 		nomem();
 
 	fsperm->fsp_uge_avl = uu_avl_create(fspset->fsps_who_perm_avl_pool,
 	    NULL, UU_DEFAULT);
 	if (fsperm->fsp_uge_avl == NULL)
 		nomem();
 
 	fsperm->fsp_name = fsname;
 	fsperm->fsp_set = fspset;
 }
 
 /*
  * Empty both AVL trees of 'fsperm' (sets/create first, then
  * user/group/everyone), finalizing and freeing each who_perm node, and then
  * destroy the two trees.  'fsperm' itself is not freed.
  */
 static inline void
 fs_perm_fini(fs_perm_t *fsperm)
 {
 	uu_avl_t *trees[2];
 	who_perm_node_t *cur, *next;
 
 	trees[0] = fsperm->fsp_sc_avl;
 	trees[1] = fsperm->fsp_uge_avl;
 
 	for (int t = 0; t < 2; t++) {
 		for (cur = uu_avl_first(trees[t]); cur != NULL; cur = next) {
 			/* fetch the successor before the node is unlinked */
 			next = uu_avl_next(trees[t], cur);
 
 			who_perm_fini(&cur->who_perm);
 			uu_avl_remove(trees[t], cur);
 			free(cur);
 		}
 	}
 
 	uu_avl_destroy(trees[0]);
 	uu_avl_destroy(trees[1]);
 }
 
 /*
  * Initialize 'node' with the given who-type and name and insert it into
  * 'avl' unless an equal entry (per the tree's comparator) already exists.
  * The locality flag is then applied to whichever entry is in the tree:
  * ZFS_DELEG_LOCAL sets dp_local, ZFS_DELEG_DESCENDENT sets dp_descend and
  * ZFS_DELEG_NA changes nothing; anything else trips an assertion.
  *
  * NOTE(review): when an equal entry already exists, 'node' is neither
  * inserted nor freed here -- confirm that callers do not leak it.
  */
 static void inline
 set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
     zfs_deleg_who_type_t who_type, const char *name, char locality)
 {
 	uu_avl_index_t where = 0;
 	deleg_perm_node_t *existing;
 	deleg_perm_t *perm;
 
 	/* the name must be set before the lookup: the comparator reads it */
 	deleg_perm_init(&node->dpn_perm, who_type, name);
 
 	existing = uu_avl_find(avl, node, NULL, &where);
 	if (existing != NULL) {
 		perm = &existing->dpn_perm;
 	} else {
 		uu_avl_insert(avl, node, where);
 		perm = &node->dpn_perm;
 	}
 
 	if (locality == ZFS_DELEG_LOCAL) {
 		perm->dp_local = B_TRUE;
 	} else if (locality == ZFS_DELEG_DESCENDENT) {
 		perm->dp_descend = B_TRUE;
 	} else if (locality != ZFS_DELEG_NA) {
 		assert(B_FALSE); /* invalid locality */
 	}
 }
 
 /*
  * Add one permission entry to 'who_perm' for every pair in 'nvl'; the pair
  * name is the permission name and every pair is expected to be of boolean
  * type.  'locality' is passed through to set_deleg_perm_node().
  * Always returns 0.
  */
 static inline int
 parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
 {
 	uu_avl_pool_t *perm_pool =
 	    who_perm->who_fsperm->fsp_set->fsps_deleg_perm_avl_pool;
 	zfs_deleg_who_type_t who_type = who_perm->who_type;
 	uu_avl_t *perms = who_perm->who_deleg_perm_avl;
 	nvpair_t *pair;
 
 	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(nvl, pair)) {
 		deleg_perm_node_t *node =
 		    safe_malloc(sizeof (deleg_perm_node_t));
 
 		assert(nvpair_type(pair) == DATA_TYPE_BOOLEAN);
 
 		uu_avl_node_init(node, &node->dpn_avl_node, perm_pool);
 		set_deleg_perm_node(perms, node, who_type,
 		    nvpair_name(pair), locality);
 	}
 
 	return (0);
 }
 
 /*
  * Populate 'fsperm' from the per-filesystem permission nvlist 'nvl'.
  *
  * Each pair name has the layout "<type><locality>$<name>": byte 0 is the
  * who-type, byte 1 the locality, byte 2 must be '$' and the rest is the
  * who-name.  The pair value is the nvlist of permissions for that "who".
  * Named-set and create entries go into fsp_sc_avl; user, group and everyone
  * entries go into fsp_uge_avl.  For new user/group entries the numeric id is
  * resolved to a name when possible.
  *
  * Returns 0 on success, or -1 if a pair value is not an nvlist or a pair
  * carries an unknown who-type.
  */
 static inline int
 parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
 {
 	nvpair_t *nvp = NULL;
 	fs_perm_set_t *fspset = fsperm->fsp_set;
 
 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 		nvlist_t *nvl2 = NULL;
 		const char *name = nvpair_name(nvp);
 		uu_avl_t *avl = NULL;
 		uu_avl_pool_t *avl_pool = NULL;
 		zfs_deleg_who_type_t perm_type = name[0];
 		char perm_locality = name[1];
 		const char *perm_name = name + 3;
 		who_perm_node_t *found_node = NULL;
 		who_perm_node_t *node;
 		who_perm_t *who_perm;
 		uu_avl_index_t idx = 0;
 
 		assert('$' == name[2]);
 
 		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
 			return (-1);
 
 		switch (perm_type) {
 		case ZFS_DELEG_CREATE:
 		case ZFS_DELEG_CREATE_SETS:
 		case ZFS_DELEG_NAMED_SET:
 		case ZFS_DELEG_NAMED_SET_SETS:
 			avl_pool = fspset->fsps_named_set_avl_pool;
 			avl = fsperm->fsp_sc_avl;
 			break;
 		case ZFS_DELEG_USER:
 		case ZFS_DELEG_USER_SETS:
 		case ZFS_DELEG_GROUP:
 		case ZFS_DELEG_GROUP_SETS:
 		case ZFS_DELEG_EVERYONE:
 		case ZFS_DELEG_EVERYONE_SETS:
 			avl_pool = fspset->fsps_who_perm_avl_pool;
 			avl = fsperm->fsp_uge_avl;
 			break;
 		default:
 			/*
 			 * Unknown who-type: there is no tree to file it
 			 * under, so fail instead of using an unset pool.
 			 */
 			assert(!"unhandled zfs_deleg_who_type_t");
 			return (-1);
 		}
 
 		/* scratch node; kept only if no equal entry exists yet */
 		node = safe_malloc(sizeof (who_perm_node_t));
 		who_perm = &node->who_perm;
 
 		uu_avl_node_init(node, &node->who_avl_node, avl_pool);
 		who_perm_init(who_perm, fsperm, perm_type, perm_name);
 
 		if ((found_node = uu_avl_find(avl, node, NULL, &idx))
 		    == NULL) {
 			if (avl == fsperm->fsp_uge_avl) {
 				uid_t rid = 0;
 				struct passwd *p = NULL;
 				struct group *g = NULL;
 				const char *nice_name = NULL;
 
 				/* translate numeric ids to names */
 				switch (perm_type) {
 				case ZFS_DELEG_USER_SETS:
 				case ZFS_DELEG_USER:
 					rid = atoi(perm_name);
 					p = getpwuid(rid);
 					if (p)
 						nice_name = p->pw_name;
 					break;
 				case ZFS_DELEG_GROUP_SETS:
 				case ZFS_DELEG_GROUP:
 					rid = atoi(perm_name);
 					g = getgrgid(rid);
 					if (g)
 						nice_name = g->gr_name;
 					break;
 				default:
 					break;
 				}
 
 				if (nice_name != NULL)
 					(void) strlcpy(
 					    node->who_perm.who_ug_name,
 					    nice_name,
 					    sizeof (node->who_perm.who_ug_name));
 			}
 
 			uu_avl_insert(avl, node, idx);
 		} else {
 			/*
 			 * An equal entry already exists: release the scratch
 			 * node (and the tree who_perm_init() created for it)
 			 * and continue with the existing entry.
 			 */
 			who_perm_fini(who_perm);
 			free(node);
 			node = found_node;
 			who_perm = &node->who_perm;
 		}
 
 		(void) parse_who_perm(who_perm, nvl2, perm_locality);
 	}
 
 	return (0);
 }
 
 /*
  * Populate 'fspset' from 'nvl', which holds one pair per filesystem: the
  * pair name is the filesystem name and the value is the nvlist understood by
  * parse_fs_perm().  Each filesystem is inserted into the set's list at the
  * index equal to the current node count.
  *
  * Returns 0 on success or -1 if a pair value is not an nvlist.  The value
  * is fetched before anything is allocated so that this failure path does
  * not leak the list node or its AVL trees.
  */
 static inline int
 parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
 {
 	nvpair_t *nvp = NULL;
 	uu_avl_index_t idx = 0;
 
 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 		nvlist_t *nvl2 = NULL;
 		const char *fsname = nvpair_name(nvp);
 		data_type_t type = nvpair_type(nvp);
 		fs_perm_t *fsperm = NULL;
 		fs_perm_node_t *node;
 
 		assert(DATA_TYPE_NVLIST == type);
 
 		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
 			return (-1);
 
 		node = safe_malloc(sizeof (fs_perm_node_t));
 		if (node == NULL)
 			nomem();
 
 		fsperm = &node->fspn_fsperm;
 
 		uu_list_node_init(node, &node->fspn_list_node,
 		    fspset->fsps_list_pool);
 
 		idx = uu_list_numnodes(fspset->fsps_list);
 		fs_perm_init(fsperm, fspset, fsname);
 
 		(void) parse_fs_perm(fsperm, nvl2);
 
 		uu_list_insert(fspset->fsps_list, node, idx);
 	}
 
 	return (0);
 }
 
 /*
  * Map a delegation note to the (translated) remark shown in the NOTES
  * column of the permission table printed by allow_usage().  Notes without
  * a case of their own yield the empty string.
  */
 static inline const char *
 deleg_perm_comment(zfs_deleg_note_t note)
 {
 	switch (note) {
 	/* notes attached to subcommands */
 	case ZFS_DELEG_NOTE_ALLOW:
 		return (gettext("Must also have the permission that is being"
 		    "\n\t\t\t\tallowed"));
 	case ZFS_DELEG_NOTE_CLONE:
 		return (gettext("Must also have the 'create' ability and 'mount'"
 		    "\n\t\t\t\tability in the origin file system"));
 	case ZFS_DELEG_NOTE_CREATE:
 	case ZFS_DELEG_NOTE_DESTROY:
 		return (gettext("Must also have the 'mount' ability"));
 	case ZFS_DELEG_NOTE_DIFF:
 		return (gettext("Allows lookup of paths within a dataset;"
 		    "\n\t\t\t\tgiven an object number. Ordinary users need this"
 		    "\n\t\t\t\tin order to use zfs diff"));
 	case ZFS_DELEG_NOTE_HOLD:
 		return (gettext("Allows adding a user hold to a snapshot"));
 	case ZFS_DELEG_NOTE_MOUNT:
 		return (gettext("Allows mount/umount of ZFS datasets"));
 	case ZFS_DELEG_NOTE_PROMOTE:
 		return (gettext("Must also have the 'mount'\n\t\t\t\tand"
 		    " 'promote' ability in the origin file system"));
 	case ZFS_DELEG_NOTE_RECEIVE:
 		return (gettext("Must also have the 'mount' and 'create'"
 		    " ability"));
 	case ZFS_DELEG_NOTE_RELEASE:
 		return (gettext("Allows releasing a user hold which\n\t\t\t\t"
 		    "might destroy the snapshot"));
 	case ZFS_DELEG_NOTE_RENAME:
 		return (gettext("Must also have the 'mount' and 'create'"
 		    "\n\t\t\t\tability in the new parent"));
 	case ZFS_DELEG_NOTE_SHARE:
 		return (gettext("Allows sharing file systems over NFS or SMB"
 		    "\n\t\t\t\tprotocols"));
 	/* subcommands that carry no remark */
 	case ZFS_DELEG_NOTE_ROLLBACK:
 	case ZFS_DELEG_NOTE_SEND:
 	case ZFS_DELEG_NOTE_SNAPSHOT:
 		return (gettext(""));
 	/*
 	 * ZFS_DELEG_NOTE_VSCAN has no case of its own here; it takes the
 	 * default path below.
 	 */
 	/* notes attached to property classes */
 	case ZFS_DELEG_NOTE_GROUPQUOTA:
 		return (gettext("Allows accessing any groupquota@... property"));
 	case ZFS_DELEG_NOTE_GROUPUSED:
 		return (gettext("Allows reading any groupused@... property"));
 	case ZFS_DELEG_NOTE_USERPROP:
 		return (gettext("Allows changing any user property"));
 	case ZFS_DELEG_NOTE_USERQUOTA:
 		return (gettext("Allows accessing any userquota@... property"));
 	case ZFS_DELEG_NOTE_USERUSED:
 		return (gettext("Allows reading any userused@... property"));
 	default:
 		return ("");
 	}
 }
 
 /*
  * Parsed command line of "zfs allow" / "zfs unallow".  The boolean flags
  * are set from getopt() in zfs_do_allow_unallow_impl(); the remaining
  * fields are filled in by parse_allow_args().
  */
 struct allow_opts {
 	boolean_t local;	/* -l; also forced on when neither -l nor -d */
 	boolean_t descend;	/* -d; also forced on when neither -l nor -d */
 	boolean_t user;		/* -u */
 	boolean_t group;	/* -g */
 	boolean_t everyone;	/* -e, or a leading "everyone" argument */
 	boolean_t create;	/* -c */
 	boolean_t set;		/* -s */
 	boolean_t recursive; /* unallow only (-r) */
 	boolean_t prt_usage;	/* -h */
 
 	boolean_t prt_perms;	/* "zfs allow <dataset>": only print perms */
 	char *who;		/* first operand: who list or @setname */
 	char *perms;		/* permission list operand, or NULL */
 	const char *dataset;	/* last operand */
 };
 
 /*
  * qsort() comparator for an array of C strings: each argument points at an
  * array element (a const char *), and the strings are ordered by strcmp().
  */
 static inline int
 prop_cmp(const void *a, const void *b)
 {
 	const char *const *lhs = a;
 	const char *const *rhs = b;
 
 	return (strcmp(*lhs, *rhs));
 }
 
 /*
  * Print the usage text of "zfs allow" (or "zfs unallow" when 'un' is set):
  * the usage line, the option table, the table of delegatable permissions
  * and the sorted list of visible, non-read-only properties.  If 'msg' is
  * not NULL it is appended as an error line.
  *
  * Output goes to stdout when 'requested' is set and to stderr otherwise.
  * This function does not return: it exits with status 0 when 'requested'
  * is set and with status 2 otherwise.
  */
 static void
 allow_usage(boolean_t un, boolean_t requested, const char *msg)
 {
 	/* flat list of (option, description) pairs */
 	const char *opt_desc[] = {
 		"-h", gettext("show this help message and exit"),
 		"-l", gettext("set permission locally"),
 		"-d", gettext("set permission for descents"),
 		"-u", gettext("set permission for user"),
 		"-g", gettext("set permission for group"),
 		"-e", gettext("set permission for everyone"),
 		"-c", gettext("set create time permission"),
 		"-s", gettext("define permission set"),
 		/* unallow only */
 		"-r", gettext("remove permissions recursively"),
 	};
 	size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
 	/* "allow" does not show the trailing unallow-only pair */
 	size_t allow_size = unallow_size - 2;
 	size_t opt_limit = un ? unallow_size : allow_size;
 	const char *props[ZFS_NUM_PROPS];
 	size_t i;
 	size_t count = 0;
 	FILE *fp = requested ? stdout : stderr;
 	zprop_desc_t *pdtbl = zfs_prop_get_table();
 	const char *fmt = gettext("%-16s %-14s\t%s\n");
 
 	(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
 	    HELP_ALLOW));
 	(void) fprintf(fp, gettext("Options:\n"));
 	for (i = 0; i + 1 < opt_limit; i += 2) {
 		(void) fprintf(fp, gettext("  %-10s  %s\n"), opt_desc[i],
 		    opt_desc[i + 1]);
 	}
 
 	(void) fprintf(fp, gettext("\nThe following permissions are "
 	    "supported:\n\n"));
 	(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
 	    gettext("NOTES"));
 	for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
 		const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
 		zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
 		const char *perm_type = deleg_perm_type(perm_note);
 		const char *perm_comment = deleg_perm_comment(perm_note);
 		(void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
 	}
 
 	for (i = 0; i < ZFS_NUM_PROPS; i++) {
 		zprop_desc_t *pd = &pdtbl[i];
 		if (pd->pd_visible != B_TRUE)
 			continue;
 
 		if (pd->pd_attr == PROP_READONLY)
 			continue;
 
 		props[count++] = pd->pd_name;
 	}
 
 	/*
 	 * No NULL terminator is stored: every consumer below is bounded by
 	 * 'count', and props[count] would be out of bounds if all
 	 * ZFS_NUM_PROPS entries were selected.
 	 */
 	qsort(props, count, sizeof (char *), prop_cmp);
 
 	for (i = 0; i < count; i++)
 		(void) fprintf(fp, fmt, props[i], gettext("property"), "");
 
 	if (msg != NULL)
 		(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
 
 	exit(requested ? 0 : 2);
 }
 
 /*
  * Pick the permission list and the dataset out of the operand vector.
  *
  * With exactly 'expected_argc' operands, *permsp is set to the
  * next-to-last operand.  For unallow ('un'), one operand fewer is also
  * accepted and *permsp is then set to NULL.  Any other operand count is
  * reported through allow_usage().
  *
  * Returns the last operand.
  */
 static inline const char *
 munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
     char **permsp)
 {
 	if (argc == expected_argc) {
 		*permsp = argv[argc - 2];
 	} else if (un && argc == expected_argc - 1) {
 		*permsp = NULL;
 	} else {
 		allow_usage(un, B_FALSE,
 		    gettext("wrong number of parameters\n"));
 	}
 
 	return (argv[argc - 1]);
 }
 
 /*
  * Validate the option combination collected in 'opts' and split the
  * remaining operands into opts->who, opts->perms and opts->dataset.
  * Invalid combinations are reported through allow_usage() / usage().
  * When neither -l nor -d was given, both are turned on.
  */
 static void
 parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
 {
 	/* running counts of how many flags of each class were given */
 	int n_uge = opts->user + opts->group + opts->everyone;
 	int n_csuge = opts->create + opts->set + n_uge;
 	int n_ldcsuge = n_csuge + opts->local + opts->descend;
 	int n_all = n_ldcsuge;
 
 	if (un)
 		n_all += opts->recursive;
 
 	if (n_uge > 1) {
 		allow_usage(un, B_FALSE,
 		    gettext("-u, -g, and -e are mutually exclusive\n"));
 	}
 
 	if (opts->prt_usage) {
 		/* -h is only honoured when it is the sole argument */
 		if (argc == 0 && n_all == 0) {
 			allow_usage(un, B_TRUE, NULL);
 		} else {
 			usage(B_FALSE);
 		}
 	}
 
 	if (opts->set) {
 		if (n_csuge > 1) {
 			allow_usage(un, B_FALSE,
 			    gettext("invalid options combined with -s\n"));
 		}
 
 		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
 		if (argv[0][0] != '@') {
 			allow_usage(un, B_FALSE,
 			    gettext("invalid set name: missing '@' prefix\n"));
 		}
 		opts->who = argv[0];
 	} else if (opts->create) {
 		if (n_ldcsuge > 1) {
 			allow_usage(un, B_FALSE,
 			    gettext("invalid options combined with -c\n"));
 		}
 		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
 	} else if (opts->everyone) {
 		if (n_csuge > 1) {
 			allow_usage(un, B_FALSE,
 			    gettext("invalid options combined with -e\n"));
 		}
 		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
 	} else if (n_uge == 0 && argc > 0 &&
 	    strcmp(argv[0], "everyone") == 0) {
 		/* the literal operand "everyone" behaves like -e */
 		opts->everyone = B_TRUE;
 		argc--;
 		argv++;
 		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
 	} else if (argc == 1 && !un) {
 		/* "zfs allow <dataset>": just print the permissions */
 		opts->prt_perms = B_TRUE;
 		opts->dataset = argv[argc-1];
 	} else {
 		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
 		opts->who = argv[0];
 	}
 
 	if (!opts->local && !opts->descend) {
 		opts->local = B_TRUE;
 		opts->descend = B_TRUE;
 	}
 }
 
 /*
  * Add the delegation entries for one "who" to 'top_nvl'.
  *
  * Each entry is keyed "<type><locality>$<who>" ('who' may be NULL, in
  * which case nothing follows the '$').  'perms' is a comma separated list;
  * names starting with '@' go under the "sets" variant of the type, all
  * others under the base type.  When 'perms' is NULL a boolean entry is
  * added for both keys instead of a permission list.
  *
  * 'perms' is modified temporarily while it is tokenized but is restored
  * before returning.
  */
 static void
 store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
     const char *who, char *perms, nvlist_t *top_nvl)
 {
 	int i;
 	char ld[2] = { '\0', '\0' };
 	char who_buf[ZFS_MAXNAMELEN+32];
 	char base_type = '\0';
 	char set_type = '\0';
 	/* "%c%c$%s" with an empty string is identical to "%c%c$" */
 	const char *who_str = (who != NULL) ? who : "";
 	nvlist_t *base_nvl = NULL;
 	nvlist_t *set_nvl = NULL;
 	nvlist_t *nvl;
 
 	if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 	if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) !=  0)
 		nomem();
 
 	switch (type) {
 	case ZFS_DELEG_NAMED_SET_SETS:
 	case ZFS_DELEG_NAMED_SET:
 		set_type = ZFS_DELEG_NAMED_SET_SETS;
 		base_type = ZFS_DELEG_NAMED_SET;
 		ld[0] = ZFS_DELEG_NA;
 		break;
 	case ZFS_DELEG_CREATE_SETS:
 	case ZFS_DELEG_CREATE:
 		set_type = ZFS_DELEG_CREATE_SETS;
 		base_type = ZFS_DELEG_CREATE;
 		ld[0] = ZFS_DELEG_NA;
 		break;
 	case ZFS_DELEG_USER_SETS:
 	case ZFS_DELEG_USER:
 		set_type = ZFS_DELEG_USER_SETS;
 		base_type = ZFS_DELEG_USER;
 		if (local)
 			ld[0] = ZFS_DELEG_LOCAL;
 		if (descend)
 			ld[1] = ZFS_DELEG_DESCENDENT;
 		break;
 	case ZFS_DELEG_GROUP_SETS:
 	case ZFS_DELEG_GROUP:
 		set_type = ZFS_DELEG_GROUP_SETS;
 		base_type = ZFS_DELEG_GROUP;
 		if (local)
 			ld[0] = ZFS_DELEG_LOCAL;
 		if (descend)
 			ld[1] = ZFS_DELEG_DESCENDENT;
 		break;
 	case ZFS_DELEG_EVERYONE_SETS:
 	case ZFS_DELEG_EVERYONE:
 		set_type = ZFS_DELEG_EVERYONE_SETS;
 		base_type = ZFS_DELEG_EVERYONE;
 		if (local)
 			ld[0] = ZFS_DELEG_LOCAL;
 		if (descend)
 			ld[1] = ZFS_DELEG_DESCENDENT;
 		break;
 	default:
 		break;
 	}
 
 	/* an unknown 'type' must not produce keys from unset type chars */
 	assert(base_type != '\0' && set_type != '\0');
 
 	if (perms != NULL) {
 		char *curr = perms;
 		char *end = curr + strlen(perms);
 
 		while (curr < end) {
 			char *delim = strchr(curr, ',');
 			if (delim == NULL)
 				delim = end;
 			else
 				*delim = '\0';
 
 			if (curr[0] == '@')
 				nvl = set_nvl;
 			else
 				nvl = base_nvl;
 
 			(void) nvlist_add_boolean(nvl, curr);
 			/* undo the temporary termination of this token */
 			if (delim != end)
 				*delim = ',';
 			curr = delim + 1;
 		}
 
 		for (i = 0; i < 2; i++) {
 			char locality = ld[i];
 			if (locality == 0)
 				continue;
 
 			if (!nvlist_empty(base_nvl)) {
 				(void) snprintf(who_buf, sizeof (who_buf),
 				    "%c%c$%s", base_type, locality, who_str);
 				(void) nvlist_add_nvlist(top_nvl, who_buf,
 				    base_nvl);
 			}
 
 			if (!nvlist_empty(set_nvl)) {
 				(void) snprintf(who_buf, sizeof (who_buf),
 				    "%c%c$%s", set_type, locality, who_str);
 				(void) nvlist_add_nvlist(top_nvl, who_buf,
 				    set_nvl);
 			}
 		}
 	} else {
 		for (i = 0; i < 2; i++) {
 			char locality = ld[i];
 			if (locality == 0)
 				continue;
 
 			(void) snprintf(who_buf, sizeof (who_buf),
 			    "%c%c$%s", base_type, locality, who_str);
 			(void) nvlist_add_boolean(top_nvl, who_buf);
 
 			(void) snprintf(who_buf, sizeof (who_buf),
 			    "%c%c$%s", set_type, locality, who_str);
 			(void) nvlist_add_boolean(top_nvl, who_buf);
 		}
 	}
 
 	/*
 	 * nvlist_add_nvlist() stores a copy, so the scratch lists are still
 	 * owned here and must be released.
 	 */
 	nvlist_free(base_nvl);
 	nvlist_free(set_nvl);
 }
 
 /*
  * Allocate *nvlp and fill it with the delegation entries described by
  * 'opts', using store_allow_perm().
  *
  * For -s, -c and -e a single entry group is stored.  Otherwise opts->who
  * is a comma separated list of users and/or groups, each given by name or
  * by numeric id; every element is resolved to a numeric id through the
  * passwd/group databases and stored under that id.  An element that cannot
  * be resolved is reported through allow_usage().
  *
  * The separators in opts->who are overwritten with '\0' as the list is
  * walked.  Always returns 0.
  */
 static int
 construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 {
 	if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	if (opts->set) {
 		store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
 		    opts->descend, opts->who, opts->perms, *nvlp);
 	} else if (opts->create) {
 		store_allow_perm(ZFS_DELEG_CREATE, opts->local,
 		    opts->descend, NULL, opts->perms, *nvlp);
 	} else if (opts->everyone) {
 		store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
 		    opts->descend, NULL, opts->perms, *nvlp);
 	} else {
 		char *curr = opts->who;
 		char *end = curr + strlen(curr);
 
 		while (curr < end) {
 			const char *who;
 			zfs_deleg_who_type_t who_type;
 			char *endch;
 			char *delim = strchr(curr, ',');
 			char errbuf[256];
 			char id[64];
 			struct passwd *p = NULL;
 			struct group *g = NULL;
 			boolean_t numeric;
 
 			uid_t rid;
 			if (delim == NULL)
 				delim = end;
 			else
 				*delim = '\0';
 
 			rid = (uid_t)strtol(curr, &endch, 0);
 			/*
 			 * The element is a numeric id only if strtol()
 			 * consumed at least one character and reached the
 			 * end.  An empty element (as in "a,,b") must not be
 			 * mistaken for id 0.
 			 */
 			numeric = (endch != curr && *endch == '\0') ?
 			    B_TRUE : B_FALSE;
 
 			/*
 			 * NOTE(review): the allow_usage() calls below pass
 			 * B_TRUE for 'requested', so an invalid name is
 			 * reported on stdout with exit status 0 - confirm
 			 * whether that is intended.
 			 */
 			if (opts->user) {
 				who_type = ZFS_DELEG_USER;
 				if (!numeric)
 					p = getpwnam(curr);
 				else
 					p = getpwuid(rid);
 
 				if (p != NULL)
 					rid = p->pw_uid;
 				else {
 					(void) snprintf(errbuf,
 					    sizeof (errbuf), gettext(
 					    "invalid user %s"), curr);
 					allow_usage(un, B_TRUE, errbuf);
 				}
 			} else if (opts->group) {
 				who_type = ZFS_DELEG_GROUP;
 				if (!numeric)
 					g = getgrnam(curr);
 				else
 					g = getgrgid(rid);
 
 				if (g != NULL)
 					rid = g->gr_gid;
 				else {
 					(void) snprintf(errbuf,
 					    sizeof (errbuf), gettext(
 					    "invalid group %s"),  curr);
 					allow_usage(un, B_TRUE, errbuf);
 				}
 			} else {
 				/* neither -u nor -g: try user, then group */
 				if (!numeric) {
 					p = getpwnam(curr);
 				} else {
 					p = getpwuid(rid);
 				}
 
 				if (p == NULL) {
 					if (!numeric) {
 						g = getgrnam(curr);
 					} else {
 						g = getgrgid(rid);
 					}
 				}
 
 				if (p != NULL) {
 					who_type = ZFS_DELEG_USER;
 					rid = p->pw_uid;
 				} else if (g != NULL) {
 					who_type = ZFS_DELEG_GROUP;
 					rid = g->gr_gid;
 				} else {
 					(void) snprintf(errbuf,
 					    sizeof (errbuf), gettext(
 					    "invalid user/group %s"), curr);
 					allow_usage(un, B_TRUE, errbuf);
 				}
 			}
 
 			(void) snprintf(id, sizeof (id), "%u", rid);
 			who = id;
 
 			store_allow_perm(who_type, opts->local,
 			    opts->descend, who, opts->perms, *nvlp);
 			curr = delim + 1;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Print the permission sets and the create-time permissions stored in
  * 'who_avl', one "who" per line followed by its comma separated
  * permission names.  A heading is printed whenever the kind of entry
  * changes.
  *
  * The heading is chosen from the entry's who_type rather than by position,
  * so a dataset that has only create-time permissions is not labelled
  * "Permission sets:".
  */
 static void
 print_set_creat_perms(uu_avl_t *who_avl)
 {
 	const char *sc_title[] = {
 		gettext("Permission sets:\n"),
 		gettext("Create time permissions:\n"),
 		NULL
 	};
 	who_perm_node_t *who_node = NULL;
 	int prev_title = -1;
 
 	for (who_node = uu_avl_first(who_avl); who_node != NULL;
 	    who_node = uu_avl_next(who_avl, who_node)) {
 		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
 		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
 		const char *who_name = who_node->who_perm.who_name;
 		boolean_t first = B_TRUE;
 		deleg_perm_node_t *deleg_node;
 		int title;
 
 		switch (who_type) {
 		case ZFS_DELEG_NAMED_SET_SETS:
 		case ZFS_DELEG_NAMED_SET:
 			title = 0;
 			break;
 		case ZFS_DELEG_CREATE_SETS:
 		case ZFS_DELEG_CREATE:
 			title = 1;
 			break;
 		default:
 			/* the NULL slot: no heading for other types */
 			title = 2;
 			break;
 		}
 
 		if (prev_title != title) {
 			if (sc_title[title] != NULL)
 				(void) printf("%s", sc_title[title]);
 			prev_title = title;
 		}
 
 		if (who_name == NULL || strnlen(who_name, 1) == 0)
 			(void) printf("\t");
 		else
 			(void) printf("\t%s ", who_name);
 
 		for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
 		    deleg_node = uu_avl_next(avl, deleg_node)) {
 			if (first) {
 				(void) printf("%s",
 				    deleg_node->dpn_perm.dp_name);
 				first = B_FALSE;
 			} else
 				(void) printf(",%s",
 				    deleg_node->dpn_perm.dp_name);
 		}
 
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print the user/group/everyone entries of 'who_avl' whose permissions
  * have exactly the given 'local' and 'descend' flags.  'title' is printed
  * once, before the first matching entry; nothing is printed when no entry
  * matches.  Each output line names the "who" and lists its matching
  * permissions separated by commas.
  */
 static inline void
 print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
     const char *title)
 {
 	who_perm_node_t *who_node = NULL;
 	boolean_t prt_title = B_TRUE;
 	uu_avl_walk_t *walk;
 
 	if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
 		nomem();
 
 	while ((who_node = uu_avl_walk_next(walk)) != NULL) {
 		const char *who_name = who_node->who_perm.who_name;
 		const char *nice_who_name = who_node->who_perm.who_ug_name;
 		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
 		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
 		char delim = ' ';
 		deleg_perm_node_t *deleg_node;
 		boolean_t prt_who = B_TRUE;
 		/*
 		 * Only prefer the looked-up name when there is one; an
 		 * empty name would otherwise hide the stored id.
 		 */
 		boolean_t have_nice = (nice_who_name != NULL &&
 		    nice_who_name[0] != '\0') ? B_TRUE : B_FALSE;
 
 		for (deleg_node = uu_avl_first(avl);
 		    deleg_node != NULL;
 		    deleg_node = uu_avl_next(avl, deleg_node)) {
 			if (local != deleg_node->dpn_perm.dp_local ||
 			    descend != deleg_node->dpn_perm.dp_descend)
 				continue;
 
 			if (prt_who) {
 				/* never hand NULL to a %s conversion */
 				const char *who = "";
 				if (prt_title) {
 					prt_title = B_FALSE;
 					/* title is data, not a format */
 					(void) printf("%s", title);
 				}
 
 				switch (who_type) {
 				case ZFS_DELEG_USER_SETS:
 				case ZFS_DELEG_USER:
 					who = gettext("user");
 					if (have_nice)
 						who_name  = nice_who_name;
 					break;
 				case ZFS_DELEG_GROUP_SETS:
 				case ZFS_DELEG_GROUP:
 					who = gettext("group");
 					if (have_nice)
 						who_name  = nice_who_name;
 					break;
 				case ZFS_DELEG_EVERYONE_SETS:
 				case ZFS_DELEG_EVERYONE:
 					who = gettext("everyone");
 					who_name = NULL;
 					break;
 				default:
 					break;
 				}
 
 				prt_who = B_FALSE;
 				if (who_name == NULL)
 					(void) printf("\t%s", who);
 				else
 					(void) printf("\t%s %s", who, who_name);
 			}
 
 			(void) printf("%c%s", delim,
 			    deleg_node->dpn_perm.dp_name);
 			delim = ',';
 		}
 
 		/* a "who" was printed, so terminate its line */
 		if (!prt_who)
 			(void) printf("\n");
 	}
 
 	uu_avl_walk_end(walk);
 }
 
 /*
  * Print every file system in 'fspset': a "---- Permissions on <name> "
  * banner padded with dashes to 70 columns, then its permission sets and
  * create-time permissions, then its local, descendent and
  * local+descendent delegations.
  */
 static void
 print_fs_perms(fs_perm_set_t *fspset)
 {
 	fs_perm_node_t *node = NULL;
 	char buf[ZFS_MAXNAMELEN+32];
 
 	for (node = uu_list_first(fspset->fsps_list); node != NULL;
 	    node = uu_list_next(fspset->fsps_list, node)) {
 		uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
 		uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
 		int left = 0;
 
 		(void) snprintf(buf, sizeof (buf),
 		    gettext("---- Permissions on %s "),
 		    node->fspn_fsperm.fsp_name);
 		/* buf holds a dataset name; never use it as a format */
 		(void) printf("%s", buf);
 		/* signed arithmetic: a long banner simply gets no padding */
 		left = 70 - (int)strlen(buf);
 		while (left-- > 0)
 			(void) printf("-");
 		(void) printf("\n");
 
 		print_set_creat_perms(sc_avl);
 		print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
 		    gettext("Local permissions:\n"));
 		print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
 		    gettext("Descendent permissions:\n"));
 		print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
 		    gettext("Local+Descendent permissions:\n"));
 	}
 }
 
 /*
  * File-scope permission set used by zfs_do_allow_unallow_impl(), which
  * initializes it, fills it from the dataset's fsacl and finalizes it again.
  */
 static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
 
 /*
  * Argument bundle for set_deleg_perms(); both members are forwarded
  * unchanged to zfs_set_fsacl().
  */
 struct deleg_perms {
 	boolean_t un;	/* second argument of zfs_set_fsacl() */
 	nvlist_t *nvl;	/* third argument of zfs_set_fsacl() */
 };
 
 /*
  * Dataset iteration callback: apply the fsacl change carried in 'data'
  * (a struct deleg_perms) to 'zhp'.  Datasets that are neither file systems
  * nor volumes are skipped and reported as success.
  */
 static int
 set_deleg_perms(zfs_handle_t *zhp, void *data)
 {
 	struct deleg_perms *dp = data;
 
 	switch (zfs_get_type(zhp)) {
 	case ZFS_TYPE_FILESYSTEM:
 	case ZFS_TYPE_VOLUME:
 		return (zfs_set_fsacl(zhp, dp->un, dp->nvl));
 	default:
 		return (0);
 	}
 }
 
 /*
  * Common implementation of "zfs allow" and "zfs unallow" ('un' set).
  *
  * Parses the options and operands, opens the dataset, reads its current
  * fsacl and then either prints it (when only a dataset was given to
  * "zfs allow") or builds an update list and applies it with
  * zfs_set_fsacl().  For "unallow -r" the update is additionally applied to
  * the descendent file systems.
  *
  * Returns 0 on success, -1 if the dataset cannot be opened and 1 on any
  * other failure.
  */
 static int
 zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
 {
 	zfs_handle_t *zhp;
 	nvlist_t *perm_nvl = NULL;
 	nvlist_t *update_perm_nvl = NULL;
 	int error = 1;
 	int c;
 	struct allow_opts opts = { 0 };
 
 	/* -r is only accepted by unallow */
 	const char *optstr = un ? "ldugecsrh" : "ldugecsh";
 
 	/* check opts */
 	while ((c = getopt(argc, argv, optstr)) != -1) {
 		switch (c) {
 		case 'l':
 			opts.local = B_TRUE;
 			break;
 		case 'd':
 			opts.descend = B_TRUE;
 			break;
 		case 'u':
 			opts.user = B_TRUE;
 			break;
 		case 'g':
 			opts.group = B_TRUE;
 			break;
 		case 'e':
 			opts.everyone = B_TRUE;
 			break;
 		case 's':
 			opts.set = B_TRUE;
 			break;
 		case 'c':
 			opts.create = B_TRUE;
 			break;
 		case 'r':
 			opts.recursive = B_TRUE;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case 'h':
 			opts.prt_usage = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check arguments */
 	parse_allow_args(argc, argv, un, &opts);
 
 	/* try to open the dataset */
 	if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME)) == NULL) {
 		(void) fprintf(stderr, "Failed to open dataset: %s\n",
 		    opts.dataset);
 		return (-1);
 	}
 
 	if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
 		goto cleanup2;
 
 	fs_perm_set_init(&fs_perm_set);
 	if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
 		(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
 		/* perm_nvl is already owned here and must be released too */
 		goto cleanup0;
 	}
 
 	if (opts.prt_perms)
 		print_fs_perms(&fs_perm_set);
 	else {
 		(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
 		if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
 			goto cleanup0;
 
 		if (un && opts.recursive) {
 			struct deleg_perms data = { un, update_perm_nvl };
 			if (zfs_iter_filesystems(zhp, set_deleg_perms,
 			    &data) != 0)
 				goto cleanup0;
 		}
 	}
 
 	error = 0;
 
 cleanup0:
 	nvlist_free(perm_nvl);
 	if (update_perm_nvl != NULL)
 		nvlist_free(update_perm_nvl);
 	fs_perm_set_fini(&fs_perm_set);
 cleanup2:
 	zfs_close(zhp);
 
 	return (error);
 }
 
 /*
  * "zfs allow" entry point: runs zfs_do_allow_unallow_impl() with the
  * unallow flag cleared and returns its result.
  */
 static int
 zfs_do_allow(int argc, char **argv)
 {
 	return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
 }
 
 /*
  * "zfs unallow" entry point: runs zfs_do_allow_unallow_impl() with the
  * unallow flag set and returns its result.
  */
 static int
 zfs_do_unallow(int argc, char **argv)
 {
 	return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
 }
 
 /*
  * Common implementation of "zfs hold" ('holding' set) and "zfs release".
  *
  * The first operand is the tag; every following operand must be a
  * snapshot name of the form <dataset>@<snap>.  For each one the parent
  * dataset is opened and the hold is placed on, or released from, the
  * named snapshot.  Operands that fail are counted and processing
  * continues with the next one.
  *
  * When holding, the option string also accepts -t, but the switch below
  * has no case for it, so it has no effect here.
  *
  * Returns 0 if every operand succeeded and 1 otherwise.
  */
 static int
 zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
 {
 	int errors = 0;
 	int i;
 	const char *tag;
 	boolean_t recursive = B_FALSE;
 	const char *opts = holding ? "rt" : "r";
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, opts)) != -1) {
 		switch (c) {
 		case 'r':
 			recursive = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 2)
 		usage(B_FALSE);
 
 	tag = argv[0];
 	--argc;
 	++argv;
 
 	if (holding && tag[0] == '.') {
 		/* tags starting with '.' are reserved for libzfs */
 		(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
 		usage(B_FALSE);
 	}
 
 	for (i = 0; i < argc; ++i) {
 		zfs_handle_t *zhp;
 		char parent[ZFS_MAXNAMELEN];
 		const char *delim;
 		char *path = argv[i];
 		size_t parent_len;
 
 		delim = strchr(path, '@');
 		if (delim == NULL) {
 			(void) fprintf(stderr,
 			    gettext("'%s' is not a snapshot\n"), path);
 			++errors;
 			continue;
 		}
 
 		/*
 		 * The operand comes straight from the command line, so its
 		 * dataset part may be longer than 'parent'; refuse it rather
 		 * than write past the end of the buffer.
 		 */
 		parent_len = (size_t)(delim - path);
 		if (parent_len >= sizeof (parent)) {
 			(void) fprintf(stderr, gettext("cannot open '%s': "
 			    "name is too long\n"), path);
 			++errors;
 			continue;
 		}
 		(void) memcpy(parent, path, parent_len);
 		parent[parent_len] = '\0';
 
 		zhp = zfs_open(g_zfs, parent,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (zhp == NULL) {
 			++errors;
 			continue;
 		}
 		if (holding) {
 			if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
 				++errors;
 		} else {
 			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
 				++errors;
 		}
 		zfs_close(zhp);
 	}
 
 	return (errors != 0);
 }
 
 /*
  * zfs hold [-r] [-t] <tag> <snap> ...
  *
  *	-r	Recursively hold
  *
  * Apply a user-hold with the given tag to the list of snapshots.
  *
  * -t is accepted by the option string in zfs_do_hold_rele_impl() but is
  * not acted upon there.  Returns that function's result: 0 if every
  * snapshot was processed successfully, 1 otherwise.
  */
 static int
 zfs_do_hold(int argc, char **argv)
 {
 	return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
 }
 
 /*
  * zfs release [-r] <tag> <snap> ...
  *
  *	-r	Recursively release
  *
  * Release a user-hold with the given tag from the list of snapshots.
  *
  * Returns the result of zfs_do_hold_rele_impl(): 0 if every snapshot was
  * processed successfully, 1 otherwise.
  */
 static int
 zfs_do_release(int argc, char **argv)
 {
 	return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
 }
 
 /*
  * State shared between zfs_do_holds() and holds_callback().
  */
 typedef struct holds_cbdata {
 	/* set when -r or -d was given */
 	boolean_t	cb_recursive;
 	/* snapshot part of the current operand, or NULL if it has none */
 	const char	*cb_snapname;
 	/* points at the nvlist that collects the holds of every dataset */
 	nvlist_t	**cb_nvlp;
 	/* longest dataset name seen so far; used as the NAME column width */
 	size_t		cb_max_namelen;
 	/* longest tag seen so far; used as the TAG column width */
 	size_t		cb_max_taglen;
 } holds_cbdata_t;
 
 #define	STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
 #define	DATETIME_BUF_LEN (32)
 /*
  * Print the holds collected in 'nvl', one line per (dataset, tag) pair.
  *
  * 'nvl' maps dataset names to nvlists, each of which maps a tag to a
  * uint64 timestamp.  Unless 'scripted' is set, a NAME/TAG/TIMESTAMP header
  * is printed first and columns are separated by two spaces; in scripted
  * mode columns are separated by a single tab.  'nwidth' and 'tagwidth'
  * are the widths of the first two columns.  With 'literal' the timestamp
  * is printed as a number, otherwise it is formatted with strftime().
  */
 static void
 print_holds(boolean_t scripted, boolean_t literal, size_t nwidth,
     size_t tagwidth, nvlist_t *nvl)
 {
 	int i;
 	nvpair_t *nvp = NULL;
 	char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
 	const char *col;
 
 	if (!scripted) {
 		for (i = 0; i < 3; i++) {
 			col = gettext(hdr_cols[i]);
 			/* a '*' field width must be passed as an int */
 			if (i < 2)
 				(void) printf("%-*s  ",
 				    (int)(i ? tagwidth : nwidth), col);
 			else
 				(void) printf("%s\n", col);
 		}
 	}
 
 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 		char *zname = nvpair_name(nvp);
 		nvlist_t *nvl2 = NULL;
 		nvpair_t *nvp2 = NULL;
 
 		/* skip anything that is not a per-dataset nvlist */
 		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
 			continue;
 
 		while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
 			char tsbuf[DATETIME_BUF_LEN];
 			char *tagname = nvpair_name(nvp2);
 			uint64_t val = 0;
 			time_t time;
 			struct tm t;
 			char sep = scripted ? '\t' : ' ';
 			size_t sepnum = scripted ? 1 : 2;
 
 			(void) nvpair_value_uint64(nvp2, &val);
 			if (literal)
 				(void) snprintf(tsbuf, sizeof (tsbuf), "%llu",
 				    (unsigned long long)val);
 			else {
 				time = (time_t)val;
 				(void) localtime_r(&time, &t);
 				(void) strftime(tsbuf, sizeof (tsbuf),
 				    gettext(STRFTIME_FMT_STR), &t);
 			}
 
 			/* "%*c" right-justifies 'sep' in 'sepnum' columns */
 			(void) printf("%-*s%*c%-*s%*c%s\n", (int)nwidth, zname,
 			    (int)sepnum, sep, (int)tagwidth, tagname,
 			    (int)sepnum, sep, tsbuf);
 		}
 	}
 }
 
 /*
  * zfs_for_each() callback for "zfs holds": fetch the holds of 'zhp' and
  * add them, keyed by the dataset name, to the nvlist that
  * ((holds_cbdata_t *)data)->cb_nvlp points at.  The longest dataset name
  * and tag seen are recorded for column sizing.
  *
  * In recursive mode with a snapshot name filter, datasets that are not
  * snapshots, or whose snapshot part differs from the filter, are skipped.
  *
  * Returns 0 on success or when the dataset is skipped, -1 if the holds
  * cannot be read, or the error from nvlist_add_nvlist().
  */
 static int
 holds_callback(zfs_handle_t *zhp, void *data)
 {
 	holds_cbdata_t *cbp = data;
 	nvlist_t *top_nvl = *cbp->cb_nvlp;
 	nvlist_t *nvl = NULL;
 	nvpair_t *nvp = NULL;
 	const char *zname = zfs_get_name(zhp);
 	size_t znamelen = strnlen(zname, ZFS_MAXNAMELEN);
 	int ret;
 
 	if (cbp->cb_recursive && cbp->cb_snapname != NULL) {
 		const char *snapname;
 		char *delim  = strchr(zname, '@');
 		if (delim == NULL)
 			return (0);
 
 		snapname = delim + 1;
 		if (strcmp(cbp->cb_snapname, snapname))
 			return (0);
 	}
 
 	if (zfs_get_holds(zhp, &nvl) != 0)
 		return (-1);
 
 	if (znamelen > cbp->cb_max_namelen)
 		cbp->cb_max_namelen  = znamelen;
 
 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 		const char *tag = nvpair_name(nvp);
 		size_t taglen = strnlen(tag, MAXNAMELEN);
 		if (taglen > cbp->cb_max_taglen)
 			cbp->cb_max_taglen  = taglen;
 	}
 
 	/*
 	 * nvlist_add_nvlist() stores a copy, so the list obtained from
 	 * zfs_get_holds() is still owned here and must be released.
 	 */
 	ret = nvlist_add_nvlist(top_nvl, zname, nvl);
 	nvlist_free(nvl);
 
 	return (ret);
 }
 
 /*
  * zfs holds [-Hp] [-r | -d max] <dataset|snap> ...
  *
  *	-H	Suppress header output
  *	-p	Output literal values
  *	-r	Recursively search for holds
  *	-d max	Limit depth of recursive search
  *
  * Collects the holds of every operand with holds_callback() and prints
  * them with print_holds().  In recursive mode an operand of the form
  * <dataset>@<snap> is cut at the '@': the walk starts at <dataset> and
  * <snap> is used as a filter on the snapshots found.
  *
  * Returns 0 if every operand was processed successfully, 1 otherwise.
  */
 static int
 zfs_do_holds(int argc, char **argv)
 {
 	int errors = 0;
 	int c;
 	int i;
 	boolean_t scripted = B_FALSE;
 	boolean_t literal = B_FALSE;
 	boolean_t recursive = B_FALSE;
 	const char *opts = "d:rHp";
 	nvlist_t *nvl;
 
 	int types = ZFS_TYPE_SNAPSHOT;
 	holds_cbdata_t cb = { 0 };
 
 	int limit = 0;
 	int ret = 0;
 	int flags = 0;
 
 	/* check options */
 	while ((c = getopt(argc, argv, opts)) != -1) {
 		switch (c) {
 		case 'd':
 			limit = parse_depth(optarg, &flags);
 			recursive = B_TRUE;
 			break;
 		case 'r':
 			recursive = B_TRUE;
 			break;
 		case 'H':
 			scripted = B_TRUE;
 			break;
 		case 'p':
 			literal = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	if (recursive) {
 		types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
 		flags |= ZFS_ITER_RECURSE;
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1)
 		usage(B_FALSE);
 
 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
 		nomem();
 
 	for (i = 0; i < argc; ++i) {
 		char *snapshot = argv[i];
 		const char *delim;
 		const char *snapname = NULL;
 
 		delim = strchr(snapshot, '@');
 		if (delim != NULL) {
 			snapname = delim + 1;
 			if (recursive)
 				snapshot[delim - snapshot] = '\0';
 		}
 
 		cb.cb_recursive = recursive;
 		cb.cb_snapname = snapname;
 		cb.cb_nvlp = &nvl;
 
 		/*
 		 *  1. collect holds data, set format options
 		 *
 		 * Only the current operand is walked: cb_snapname belongs
 		 * to it alone, so walking the whole operand list here would
 		 * apply this operand's snapshot filter to the others.
 		 */
 		ret = zfs_for_each(1, argv + i, flags, types, NULL, NULL,
 		    limit, holds_callback, &cb);
 		if (ret != 0)
 			++errors;
 	}
 
 	/*
 	 *  2. print holds data
 	 */
 	print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen,
 	    nvl);
 
 	if (nvlist_empty(nvl))
 		(void) printf(gettext("no datasets available\n"));
 
 	nvlist_free(nvl);
 
 	return (0 != errors);
 }
 
 #define	CHECK_SPINNER 30
 #define	SPINNER_TIME 3		/* seconds */
 #define	MOUNT_TIME 5		/* seconds */
 
 /*
  * Dataset iteration callback used to build the list of all file systems.
  *
  * Descends into the children of 'zhp' first, then hands 'zhp' to
  * libzfs_add_handle() if it is a file system, or closes it if it is not.
  * In verbose mode a progress spinner is advanced at most once every
  * SPINNER_TIME seconds; the clock is only consulted on every
  * CHECK_SPINNER-th call.
  *
  * Returns 0 on success, or 1 (after closing 'zhp') if iterating the
  * children fails.
  */
 static int
 get_one_dataset(zfs_handle_t *zhp, void *data)
 {
 	static char *spin[] = { "-", "\\", "|", "/" };
 	static int spinval = 0;
 	static int spincheck = 0;
 	static time_t last_spin_time = (time_t)0;
 	get_all_cb_t *cbp = data;
 	zfs_type_t type = zfs_get_type(zhp);
 
 	if (cbp->cb_verbose && --spincheck < 0) {
 		time_t now = time(NULL);
 
 		if (last_spin_time + SPINNER_TIME < now) {
 			update_progress(spin[spinval++ % 4]);
 			last_spin_time = now;
 		}
 		spincheck = CHECK_SPINNER;
 	}
 
 	/*
 	 * Iterate over any nested datasets.
 	 */
 	if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 	/*
 	 * Keep file systems; drop every other dataset type.
 	 */
 	if ((type & ZFS_TYPE_FILESYSTEM) != 0) {
 		libzfs_add_handle(cbp, zhp);
 		assert(cbp->cb_used <= cbp->cb_alloc);
 	} else {
 		zfs_close(zhp);
 	}
 
 	return (0);
 }
 
 /*
  * Collect handles with get_one_dataset(), starting from zfs_iter_root().
  * On return *dslist is the handle array and *count the number of entries
  * in it.  When 'verbose' is set, a progress header is shown while reading
  * and closed with "done." afterwards.
  */
 static void
 get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
 {
 	get_all_cb_t walk = { 0 };
 
 	walk.cb_getone = get_one_dataset;
 	walk.cb_verbose = verbose;
 
 	if (verbose)
 		set_progress_header(gettext("Reading ZFS config"));
 
 	(void) zfs_iter_root(g_zfs, get_one_dataset, &walk);
 
 	*count = walk.cb_used;
 	*dslist = walk.cb_handles;
 
 	if (verbose)
 		finish_progress(gettext("done."));
 }
 
 /*
  * Generic callback for sharing or mounting filesystems.  Because the code is so
  * similar, we have a common function with an extra parameter to determine which
  * mode we are using.
  */
 #define	OP_SHARE	0x1
 #define	OP_MOUNT	0x2
 
 /*
  * Share or mount a dataset.
  */
 static int
 share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
     boolean_t explicit, const char *options)
 {
 	char mountpoint[ZFS_MAXPROPLEN];
 	char shareopts[ZFS_MAXPROPLEN];
 	char smbshareopts[ZFS_MAXPROPLEN];
 	const char *cmdname = op == OP_SHARE ? "share" : "mount";
 	struct mnttab mnt;
 	uint64_t zoned, canmount;
 	boolean_t shared_nfs, shared_smb;
 
 	assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
 
 	/*
 	 * Check to make sure we can mount/share this dataset.  If we
 	 * are in the global zone and the filesystem is exported to a
 	 * local zone, or if we are in a local zone and the
 	 * filesystem is not exported, then it is an error.
 	 */
 	zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 
 	if (zoned && getzoneid() == GLOBAL_ZONEID) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': "
 		    "dataset is exported to a local zone\n"), cmdname,
 		    zfs_get_name(zhp));
 		return (1);
 
 	} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': "
 		    "permission denied\n"), cmdname,
 		    zfs_get_name(zhp));
 		return (1);
 	}
 
 	/*
 	 * Ignore any filesystems which don't apply to us. This
 	 * includes those with a legacy mountpoint, or those with
 	 * legacy share options.
 	 */
 	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
 	verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
 	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
 	verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
 	    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
 
 	if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
 	    strcmp(smbshareopts, "off") == 0) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot share '%s': "
 		    "legacy share\n"), zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("to "
 		    "share this filesystem set "
 		    "sharenfs property on\n"));
 		return (1);
 	}
 
 	/*
 	 * We cannot share or mount legacy filesystems. If the
 	 * shareopts is non-legacy but the mountpoint is legacy, we
 	 * treat it as a legacy share.
 	 */
 	if (strcmp(mountpoint, "legacy") == 0) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': "
 		    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("use %s(8) to "
 		    "%s this filesystem\n"), cmdname, cmdname);
 		return (1);
 	}
 
 	if (strcmp(mountpoint, "none") == 0) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': no "
 		    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
 		return (1);
 	}
 
 	/*
 	 * canmount	explicit	outcome
 	 * on		no		pass through
 	 * on		yes		pass through
 	 * off		no		return 0
 	 * off		yes		display error, return 1
 	 * noauto	no		return 0
 	 * noauto	yes		pass through
 	 */
 	canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
 	if (canmount == ZFS_CANMOUNT_OFF) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': "
 		    "'canmount' property is set to 'off'\n"), cmdname,
 		    zfs_get_name(zhp));
 		return (1);
 	} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
 		return (0);
 	}
 
 	/*
 	 * If this filesystem is inconsistent and has a receive resume
 	 * token, we can not mount it.
 	 */
 	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
 	    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
 	    NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
 		if (!explicit)
 			return (0);
 
 		(void) fprintf(stderr, gettext("cannot %s '%s': "
 		    "Contains partially-completed state from "
 		    "\"zfs receive -r\", which can be resumed with "
 		    "\"zfs send -t\"\n"),
 		    cmdname, zfs_get_name(zhp));
 		return (1);
 	}
 
 	/*
 	 * At this point, we have verified that the mountpoint and/or
 	 * shareopts are appropriate for auto management. If the
 	 * filesystem is already mounted or shared, return (failing
 	 * for explicit requests); otherwise mount or share the
 	 * filesystem.
 	 */
 	switch (op) {
 	case OP_SHARE:
 
 		shared_nfs = zfs_is_shared_nfs(zhp, NULL);
 		shared_smb = zfs_is_shared_smb(zhp, NULL);
 
 		if (shared_nfs && shared_smb ||
 		    (shared_nfs && strcmp(shareopts, "on") == 0 &&
 		    strcmp(smbshareopts, "off") == 0) ||
 		    (shared_smb && strcmp(smbshareopts, "on") == 0 &&
 		    strcmp(shareopts, "off") == 0)) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share "
 			    "'%s': filesystem already shared\n"),
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (!zfs_is_mounted(zhp, NULL) &&
 		    zfs_mount(zhp, NULL, 0) != 0)
 			return (1);
 
 		if (protocol == NULL) {
 			if (zfs_shareall(zhp) != 0)
 				return (1);
 		} else if (strcmp(protocol, "nfs") == 0) {
 			if (zfs_share_nfs(zhp))
 				return (1);
 		} else if (strcmp(protocol, "smb") == 0) {
 			if (zfs_share_smb(zhp))
 				return (1);
 		} else {
 			(void) fprintf(stderr, gettext("cannot share "
 			    "'%s': invalid share type '%s' "
 			    "specified\n"),
 			    zfs_get_name(zhp), protocol);
 			return (1);
 		}
 
 		break;
 
 	case OP_MOUNT:
 		if (options == NULL)
 			mnt.mnt_mntopts = "";
 		else
 			mnt.mnt_mntopts = (char *)options;
 
 		if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
 		    zfs_is_mounted(zhp, NULL)) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot mount "
 			    "'%s': filesystem already mounted\n"),
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (zfs_mount(zhp, options, flags) != 0)
 			return (1);
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Reports progress in the form "(current/total)".  Not thread-safe.
  */
 static void
 report_mount_progress(int current, int total)
 {
 	static time_t last_progress_time = 0;
 	time_t now = time(NULL);
 	char info[32];
 
 	/* report 1..n instead of 0..n-1 */
 	++current;
 
 	/* display header if we're here for the first time */
 	if (current == 1) {
 		set_progress_header(gettext("Mounting ZFS filesystems"));
 	} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
 		/* too soon to report again */
 		return;
 	}
 
 	last_progress_time = now;
 
 	(void) sprintf(info, "(%d/%d)", current, total);
 
 	if (current == total)
 		finish_progress(info);
 	else
 		update_progress(info);
 }
 
 /*
  * Append the option string 'newopts' to the comma-separated list already
  * held in 'mntopts', inserting a ',' separator when 'mntopts' is non-empty.
  *
  * 'mntopts' must be a NUL-terminated string in a buffer of at least
  * MNT_LINE_MAX bytes.  If the combined string would not fit, an error is
  * printed and usage() is called (presumably without returning -- the copy
  * below relies on that; confirm against usage()).
  */
 static void
 append_options(char *mntopts, char *newopts)
 {
 	size_t len = strlen(mntopts);
 
 	/* original length plus new string to append plus 1 for the comma */
 	if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
 		/* "-o" is a string, so it must be printed with %s, not %c */
 		(void) fprintf(stderr, gettext("the opts argument for "
 		    "'%s' option is too long (more than %d chars)\n"),
 		    "-o", MNT_LINE_MAX);
 		usage(B_FALSE);
 	}
 
 	if (*mntopts)
 		mntopts[len++] = ',';
 
 	(void) strcpy(&mntopts[len], newopts);
 }
 
 /*
  * Common implementation of 'zfs mount' and 'zfs share'; 'op' is OP_MOUNT or
  * OP_SHARE and argc/argv are the subcommand's arguments.
  *
  * Three modes, selected by the arguments:
  *   -a            process every dataset returned by get_all_datasets(); for
  *                 share, an optional trailing "nfs" or "smb" operand picks
  *                 the protocol
  *   no operand    (mount only, and only without -o) list the active ZFS
  *                 mounts found in the mount table, hiding snapshots
  *   one operand   open it as a filesystem and mount/share just that one
  *
  * Returns 0 on success and 1 if any dataset could not be processed.
  * NOTE(review): the error paths assume usage() does not return -- confirm.
  */
 static int
 share_mount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	boolean_t verbose = B_FALSE;
 	int c, ret = 0;
 	char *options = NULL;
 	int flags = 0;
 
 	/* check options */
 	/* mount accepts -a, -v, -o <opts> and -O; share accepts only -a */
 	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
 	    != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'v':
 			verbose = B_TRUE;
 			break;
 		case 'o':
 			if (*optarg == '\0') {
 				(void) fprintf(stderr, gettext("empty mount "
 				    "options (-o) specified\n"));
 				usage(B_FALSE);
 			}
 
 			/*
 			 * Allocated once and accumulated across repeated -o.
 			 * NOTE(review): append_options() treats the buffer as
 			 * a C string, so this relies on safe_malloc() handing
 			 * back zeroed memory -- confirm.
 			 */
 			if (options == NULL)
 				options = safe_malloc(MNT_LINE_MAX + 1);
 
 			/* option validation is done later */
 			append_options(options, optarg);
 			break;
 
 		case 'O':
 			warnx("no overlay mounts support on FreeBSD, ignoring");
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (do_all) {
 		zfs_handle_t **dslist = NULL;
 		size_t i, count = 0;
 		char *protocol = NULL;
 
 		/* 'zfs share -a' may be followed by a protocol name */
 		if (op == OP_SHARE && argc > 0) {
 			if (strcmp(argv[0], "nfs") != 0 &&
 			    strcmp(argv[0], "smb") != 0) {
 				(void) fprintf(stderr, gettext("share type "
 				    "must be 'nfs' or 'smb'\n"));
 				usage(B_FALSE);
 			}
 			protocol = argv[0];
 			argc--;
 			argv++;
 		}
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		start_progress_timer();
 		get_all_datasets(&dslist, &count, verbose);
 
 		if (count == 0)
 			return (0);
 
 		qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
 
 		for (i = 0; i < count; i++) {
 			if (verbose)
 				report_mount_progress(i, count);
 
 			/* B_FALSE: these datasets were not named by the user */
 			if (share_mount_one(dslist[i], op, flags, protocol,
 			    B_FALSE, options) != 0)
 				ret = 1;
 			zfs_close(dslist[i]);
 		}
 
 		free(dslist);
 	} else if (argc == 0) {
 		struct mnttab entry;
 
 		/* only a bare 'zfs mount' (no -o) may omit the operand */
 		if ((op == OP_SHARE) || (options != NULL)) {
 			(void) fprintf(stderr, gettext("missing filesystem "
 			    "argument (specify -a for all)\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * When mount is given no arguments, go through /etc/mnttab and
 		 * display any active ZFS mounts.  We hide any snapshots, since
 		 * they are controlled automatically.
 		 */
 		rewind(mnttab_file);
 		while (getmntent(mnttab_file, &entry) == 0) {
 			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
 			    strchr(entry.mnt_special, '@') != NULL)
 				continue;
 
 			(void) printf("%-30s  %s\n", entry.mnt_special,
 			    entry.mnt_mountp);
 		}
 
 	} else {
 		zfs_handle_t *zhp;
 
 		if (argc > 1) {
 			(void) fprintf(stderr,
 			    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if ((zhp = zfs_open(g_zfs, argv[0],
 		    ZFS_TYPE_FILESYSTEM)) == NULL) {
 			ret = 1;
 		} else {
 			/* B_TRUE: the user named this filesystem explicitly */
 			ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
 			    options);
 			zfs_close(zhp);
 		}
 	}
 
 	return (ret);
 }
 
 /*
  * zfs mount
  * zfs mount [-vO] [-o opts] -a
  * zfs mount [-vO] [-o opts] filesystem
  *
  * With no operand, list the mounted ZFS filesystems; otherwise mount all
  * filesystems or the given one.  Thin wrapper around share_mount().
  */
 static int
 zfs_do_mount(int argc, char **argv)
 {
 	int status = share_mount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs share -a [nfs | smb]
  * zfs share filesystem
  *
  * Share every filesystem (optionally restricted to one protocol) or just
  * the named one.  Thin wrapper around share_mount().
  */
 static int
 zfs_do_share(int argc, char **argv)
 {
 	int status = share_mount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /*
  * One mounted ZFS filesystem queued for 'unmount -a' / 'unshare -a'.
  * Nodes live in a uu_avl tree ordered by unshare_unmount_compare().
  */
 typedef struct unshare_unmount_node {
 	zfs_handle_t	*un_zhp;	/* open handle, closed by the tree owner */
 	char		*un_mountp;	/* heap copy of the mountpoint; sort key */
 	uu_avl_node_t	un_avlnode;	/* linkage into the AVL tree */
 } unshare_unmount_node_t;
 
 /*
  * AVL comparator for unshare_unmount_node_t: orders two nodes by a plain
  * strcmp() of their mountpoint strings.  The third argument is ignored.
  */
 /* ARGSUSED */
 static int
 unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
 {
 	const char *lhs = ((const unshare_unmount_node_t *)larg)->un_mountp;
 	const char *rhs = ((const unshare_unmount_node_t *)rarg)->un_mountp;
 
 	return (strcmp(lhs, rhs));
 }
 
 /*
  * Convenience routine used by zfs_do_umount() and manual_unmount().  Given an
  * absolute path, find its entry in /etc/mnttab, verify that it's a ZFS
  * filesystem, and unshare or unmount it appropriately.
  *
  *   op         OP_SHARE to unshare, anything else to unmount
  *   path       path naming the mountpoint
  *   flags      passed through to the unmount calls
  *   is_manual  B_TRUE when invoked as the stand-alone umount helper; in that
  *              case legacy mountpoints are unmounted rather than rejected
  *
  * Returns 0 on success and 1 on failure.
  */
 static int
 unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 {
 	zfs_handle_t *zhp;
 	int ret = 0;
 	struct stat64 statbuf;
 	struct extmnttab entry;
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	ino_t path_inode;
 
 	/*
 	 * Search for the path in /etc/mnttab.  Rather than looking for the
 	 * specific path, which can be fooled by non-standard paths (i.e. ".."
 	 * or "//"), we stat() the path and search for the corresponding
 	 * (major,minor) device pair.
 	 */
 	if (stat64(path, &statbuf) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
 		    cmdname, path, strerror(errno));
 		return (1);
 	}
 	/* remembered so we can later check that 'path' is the mountpoint */
 	path_inode = statbuf.st_ino;
 
 	/*
 	 * Search for the given (major,minor) pair in the mount table.
 	 */
 #ifdef illumos
 	rewind(mnttab_file);
 	while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
 		if (entry.mnt_major == major(statbuf.st_dev) &&
 		    entry.mnt_minor == minor(statbuf.st_dev))
 			break;
 	}
 #else
 	{
 		struct statfs sfs;
 
 		/*
 		 * Without an extended mnttab, ask the kernel which filesystem
 		 * contains 'path'.  Only convert the result when statfs()
 		 * succeeded; otherwise 'sfs' is uninitialized and 'entry' is
 		 * not consulted by the failure path below.
 		 */
 		if (statfs(path, &sfs) != 0) {
 			(void) fprintf(stderr, "%s: %s\n", path,
 			    strerror(errno));
 			ret = -1;
 		} else {
 			statfs2mnttab(&sfs, &entry);
 		}
 	}
 #endif
 	if (ret != 0) {
 		if (op == OP_SHARE) {
 			(void) fprintf(stderr, gettext("cannot %s '%s': not "
 			    "currently mounted\n"), cmdname, path);
 			return (1);
 		}
 		/* unmount: fall back to a raw umount2() on the path */
 		(void) fprintf(stderr, gettext("warning: %s not in mnttab\n"),
 		    path);
 		if ((ret = umount2(path, flags)) != 0)
 			(void) fprintf(stderr, gettext("%s: %s\n"), path,
 			    strerror(errno));
 		return (ret != 0);
 	}
 
 	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
 		    "filesystem\n"), cmdname, path);
 		return (1);
 	}
 
 	if ((zhp = zfs_open(g_zfs, entry.mnt_special,
 	    ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	/* from here on every exit goes through 'out' to close the handle */
 	ret = 1;
 	if (stat64(entry.mnt_mountp, &statbuf) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
 		    cmdname, path, strerror(errno));
 		goto out;
 	} else if (statbuf.st_ino != path_inode) {
 		/* 'path' lies inside the filesystem but is not its root */
 		(void) fprintf(stderr, gettext("cannot "
 		    "%s '%s': not a mountpoint\n"), cmdname, path);
 		goto out;
 	}
 
 	if (op == OP_SHARE) {
 		char nfs_mnt_prop[ZFS_MAXPROPLEN];
 		char smbshare_prop[ZFS_MAXPROPLEN];
 
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
 		    sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
 		    sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
 
 		if (strcmp(nfs_mnt_prop, "off") == 0 &&
 		    strcmp(smbshare_prop, "off") == 0) {
 			(void) fprintf(stderr, gettext("cannot unshare "
 			    "'%s': legacy share\n"), path);
 #ifdef illumos
 			(void) fprintf(stderr, gettext("use "
 			    "unshare(1M) to unshare this filesystem\n"));
 #endif
 		} else if (!zfs_is_shared(zhp)) {
 			(void) fprintf(stderr, gettext("cannot unshare '%s': "
 			    "not currently shared\n"), path);
 		} else {
 			ret = zfs_unshareall_bypath(zhp, path);
 		}
 	} else {
 		char mtpt_prop[ZFS_MAXPROPLEN];
 
 		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
 		    sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
 
 		if (is_manual) {
 			ret = zfs_unmount(zhp, NULL, flags);
 		} else if (strcmp(mtpt_prop, "legacy") == 0) {
 			(void) fprintf(stderr, gettext("cannot unmount "
 			    "'%s': legacy mountpoint\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use umount(8) "
 			    "to unmount this filesystem\n"));
 		} else {
 			ret = zfs_unmountall(zhp, flags);
 		}
 	}
 
 out:
 	zfs_close(zhp);
 
 	return (ret != 0);
 }
 
 /*
  * Generic callback for unsharing or unmounting a filesystem.
  *
  * 'op' is OP_SHARE (unshare) or OP_MOUNT (unmount); argc/argv are the
  * subcommand's arguments.  Accepts -a for both operations and -f (force)
  * for unmount only.  Without -a exactly one operand is required: either an
  * absolute path, which is handed to unshare_unmount_path(), or a filesystem
  * name.  Returns 0 on success and 1 if anything failed.
  */
 static int
 unshare_unmount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	int flags = 0;
 	int ret = 0;
 	int c;
 	zfs_handle_t *zhp;
 	char nfs_mnt_prop[ZFS_MAXPROPLEN];
 	char sharesmb[ZFS_MAXPROPLEN];
 
 	/* check options */
 	while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'f':
 			flags = MS_FORCE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (do_all) {
 		/*
 		 * We could make use of zfs_for_each() to walk all datasets in
 		 * the system, but this would be very inefficient, especially
 		 * since we would have to linearly search /etc/mnttab for each
 		 * one.  Instead, do one pass through /etc/mnttab looking for
 		 * zfs entries and call zfs_unmount() for each one.
 		 *
 		 * Things get a little tricky if the administrator has created
 		 * mountpoints beneath other ZFS filesystems.  In this case, we
 		 * have to unmount the deepest filesystems first.  To accomplish
 		 * this, we place all the mountpoints in an AVL tree sorted by
 		 * mountpoint path (see unshare_unmount_compare()), and walk the
 		 * result in reverse, so that a path nested beneath another one
 		 * is handled before its parent.
 		 */
 		struct mnttab entry;
 		uu_avl_pool_t *pool;
 		uu_avl_t *tree;
 		unshare_unmount_node_t *node;
 		uu_avl_index_t idx;
 		uu_avl_walk_t *walk;
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if (((pool = uu_avl_pool_create("unmount_pool",
 		    sizeof (unshare_unmount_node_t),
 		    offsetof(unshare_unmount_node_t, un_avlnode),
 		    unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
 		    ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
 			nomem();
 
 		rewind(mnttab_file);
 		while (getmntent(mnttab_file, &entry) == 0) {
 
 			/* ignore non-ZFS entries */
 			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
 				continue;
 
 			/* ignore snapshots */
 			if (strchr(entry.mnt_special, '@') != NULL)
 				continue;
 
 			if ((zhp = zfs_open(g_zfs, entry.mnt_special,
 			    ZFS_TYPE_FILESYSTEM)) == NULL) {
 				ret = 1;
 				continue;
 			}
 
 			/*
 			 * Decide whether this filesystem takes part at all.
 			 * NOTE(review): the 'continue' statements below skip
 			 * the entry without calling zfs_close(zhp) -- confirm
 			 * whether that is intended.
 			 */
 			switch (op) {
 			case OP_SHARE:
 				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
 				    nfs_mnt_prop,
 				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
 				if (strcmp(nfs_mnt_prop, "off") != 0)
 					break;
 				/* nfs_mnt_prop is reused as scratch for sharesmb */
 				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
 				    nfs_mnt_prop,
 				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
 				/* neither sharenfs nor sharesmb is set: skip */
 				if (strcmp(nfs_mnt_prop, "off") == 0)
 					continue;
 				break;
 			case OP_MOUNT:
 				/* Ignore legacy mounts */
 				verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
 				    nfs_mnt_prop,
 				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
 				if (strcmp(nfs_mnt_prop, "legacy") == 0)
 					continue;
 				/* Ignore canmount=noauto mounts */
 				if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
 				    ZFS_CANMOUNT_NOAUTO)
 					continue;
 				/* FALLTHROUGH */
 			default:
 				break;
 			}
 
 			/* the node takes ownership of zhp and of the path copy */
 			node = safe_malloc(sizeof (unshare_unmount_node_t));
 			node->un_zhp = zhp;
 			node->un_mountp = safe_strdup(entry.mnt_mountp);
 
 			uu_avl_node_init(node, &node->un_avlnode, pool);
 
 			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
 				uu_avl_insert(tree, node, idx);
 			} else {
 				/* this mountpoint is already queued; drop it */
 				zfs_close(node->un_zhp);
 				free(node->un_mountp);
 				free(node);
 			}
 		}
 
 		/*
 		 * Walk the AVL tree in reverse, unmounting each filesystem and
 		 * removing it from the AVL tree in the process.
 		 */
 		if ((walk = uu_avl_walk_start(tree,
 		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
 			nomem();
 
 		while ((node = uu_avl_walk_next(walk)) != NULL) {
 			uu_avl_remove(tree, node);
 
 			/* a failure is recorded but does not stop the walk */
 			switch (op) {
 			case OP_SHARE:
 				if (zfs_unshareall_bypath(node->un_zhp,
 				    node->un_mountp) != 0)
 					ret = 1;
 				break;
 
 			case OP_MOUNT:
 				if (zfs_unmount(node->un_zhp,
 				    node->un_mountp, flags) != 0)
 					ret = 1;
 				break;
 			}
 
 			zfs_close(node->un_zhp);
 			free(node->un_mountp);
 			free(node);
 		}
 
 		uu_avl_walk_end(walk);
 		uu_avl_destroy(tree);
 		uu_avl_pool_destroy(pool);
 
 	} else {
 		if (argc != 1) {
 			if (argc == 0)
 				(void) fprintf(stderr,
 				    gettext("missing filesystem argument\n"));
 			else
 				(void) fprintf(stderr,
 				    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * We have an argument, but it may be a full path or a ZFS
 		 * filesystem.  Pass full paths off to unshare_unmount_path()
 		 * (shared by manual_unmount), otherwise open the filesystem
 		 * and unshare or unmount it here.
 		 */
 		if (argv[0][0] == '/')
 			return (unshare_unmount_path(op, argv[0],
 			    flags, B_FALSE));
 
 		if ((zhp = zfs_open(g_zfs, argv[0],
 		    ZFS_TYPE_FILESYSTEM)) == NULL)
 			return (1);
 
 		/*
 		 * Fetch sharenfs (unshare) or mountpoint (unmount) into
 		 * nfs_mnt_prop.  For OP_SHARE the same property is fetched
 		 * again inside the switch below.
 		 */
 		verify(zfs_prop_get(zhp, op == OP_SHARE ?
 		    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
 		    nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
 		    NULL, 0, B_FALSE) == 0);
 
 		switch (op) {
 		case OP_SHARE:
 			verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
 			    nfs_mnt_prop,
 			    sizeof (nfs_mnt_prop),
 			    NULL, NULL, 0, B_FALSE) == 0);
 			verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
 			    sharesmb, sizeof (sharesmb), NULL, NULL,
 			    0, B_FALSE) == 0);
 
 			if (strcmp(nfs_mnt_prop, "off") == 0 &&
 			    strcmp(sharesmb, "off") == 0) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unshare '%s': legacy share\n"),
 				    zfs_get_name(zhp));
 #ifdef illumos
 				(void) fprintf(stderr, gettext("use "
 				    "unshare(1M) to unshare this "
 				    "filesystem\n"));
 #endif
 				ret = 1;
 			} else if (!zfs_is_shared(zhp)) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unshare '%s': not currently "
 				    "shared\n"), zfs_get_name(zhp));
 				ret = 1;
 			} else if (zfs_unshareall(zhp) != 0) {
 				ret = 1;
 			}
 			break;
 
 		case OP_MOUNT:
 			if (strcmp(nfs_mnt_prop, "legacy") == 0) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unmount '%s': legacy "
 				    "mountpoint\n"), zfs_get_name(zhp));
 				(void) fprintf(stderr, gettext("use "
 				    "umount(8) to unmount this "
 				    "filesystem\n"));
 				ret = 1;
 			} else if (!zfs_is_mounted(zhp, NULL)) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unmount '%s': not currently "
 				    "mounted\n"),
 				    zfs_get_name(zhp));
 				ret = 1;
 			} else if (zfs_unmountall(zhp, flags) != 0) {
 				ret = 1;
 			}
 			break;
 		}
 
 		zfs_close(zhp);
 	}
 
 	return (ret);
 }
 
 /*
  * zfs unmount [-f] -a
  * zfs unmount [-f] filesystem|mountpoint
  *
  * Unmount every ZFS filesystem, or one selected by name or by absolute
  * path.  Thin wrapper around unshare_unmount().
  */
 static int
 zfs_do_unmount(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs unshare -a
  * zfs unshare filesystem|mountpoint
  *
  * Unshare every ZFS filesystem, or one selected by name or by absolute
  * path.  Thin wrapper around unshare_unmount().
  */
 static int
 zfs_do_unshare(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /*
  * Shared worker for 'zfs jail' and 'zfs unjail'.  Expects exactly
  * argv[1] = jail id or name and argv[2] = filesystem; 'attach' is passed
  * straight to zfs_jail().  Returns 0 on success, 1 on failure.
  */
 /* ARGSUSED */
 static int
 do_jail(int argc, char **argv, int attach)
 {
 	zfs_handle_t *fs;
 	int jid;
 	int failed;
 
 	/* exactly three arguments: command, jail, filesystem */
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing argument(s)\n"));
 		usage(B_FALSE);
 	} else if (argc > 3) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	if ((jid = jail_getid(argv[1])) < 0) {
 		(void) fprintf(stderr, gettext("invalid jail id or name\n"));
 		usage(B_FALSE);
 	}
 
 	if ((fs = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	failed = (zfs_jail(fs, jid, attach) != 0);
 	zfs_close(fs);
 
 	return (failed);
 }
 
 /*
  * zfs jail jailid filesystem
  *
  * Attach the named dataset to the given jail (do_jail() with attach = 1).
  */
 /* ARGSUSED */
 static int
 zfs_do_jail(int argc, char **argv)
 {
 	int status = do_jail(argc, argv, 1);
 
 	return (status);
 }
 
 /*
  * zfs unjail jailid filesystem
  *
  * Detach the named dataset from the given jail (do_jail() with attach = 0).
  */
 /* ARGSUSED */
 static int
 zfs_do_unjail(int argc, char **argv)
 {
 	int status = do_jail(argc, argv, 0);
 
 	return (status);
 }
 
 /*
  * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
  * 'legacy'.  Otherwise, complain that the user should be using 'zfs mount'.
  *
  * Expects "[-m] [-O] [-o opts] <dataset> <mountpoint>".  Returns 0 on
  * success, 1 if the dataset cannot be opened or mounted, and 2 on a usage
  * error.
  */
 static int
 manual_mount(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	/*
 	 * Start out empty: the property lookup below is best-effort, and an
 	 * empty string makes a failed lookup compare as "not legacy" instead
 	 * of reading uninitialized stack memory.
 	 */
 	char mountpoint[ZFS_MAXPROPLEN] = { '\0' };
 	char mntopts[MNT_LINE_MAX] = { '\0' };
 	int ret = 0;
 	int c;
 	int flags = 0;
 	char *dataset, *path;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":mo:O")) != -1) {
 		switch (c) {
 		case 'o':
 			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
 			break;
 		case 'O':
 			flags |= MS_OVERLAY;
 			break;
 		case 'm':
 			flags |= MS_NOMNTTAB;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr, gettext("usage: mount [-o opts] "
 			    "<path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check that we only have two arguments */
 	if (argc != 2) {
 		if (argc == 0)
 			(void) fprintf(stderr, gettext("missing dataset "
 			    "argument\n"));
 		else if (argc == 1)
 			(void) fprintf(stderr,
 			    gettext("missing mountpoint argument\n"));
 		else
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
 		return (2);
 	}
 
 	dataset = argv[0];
 	path = argv[1];
 
 	/* try to open the dataset */
 	if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
 
 	/* check for legacy mountpoint and complain appropriately */
 	ret = 0;
 	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
 		if (zmount(dataset, path, flags, MNTTYPE_ZFS,
 		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
 			(void) fprintf(stderr, gettext("mount failed: %s\n"),
 			    strerror(errno));
 			ret = 1;
 		}
 	} else {
 		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
 		    "mounted using 'mount -t zfs'\n"), dataset);
 		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
 		    "instead.\n"), path);
 		(void) fprintf(stderr, gettext("If you must use 'mount -t zfs' "
 		    "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n"));
 		(void) fprintf(stderr, gettext("See zfs(8) for more "
 		    "information.\n"));
 		ret = 1;
 	}
 
 	/* release the handle obtained from zfs_open() on every path */
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Entry point used when the binary is invoked as /etc/fs/zfs/umount.  In
  * contrast to manual_mount(), non-legacy filesystems may be unmounted this
  * way too, since this is the dominant administrative interface.
  *
  * Expects "[-f] <path>".  Returns 2 on a usage error, otherwise whatever
  * unshare_unmount_path() returns.
  */
 static int
 manual_unmount(int argc, char **argv)
 {
 	int umount_flags = 0;
 	int opt;
 
 	/* option parsing: only -f (force) is understood */
 	for (;;) {
 		opt = getopt(argc, argv, "f");
 		if (opt == -1)
 			break;
 
 		if (opt == 'f') {
 			umount_flags = MS_FORCE;
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr, gettext("usage: unmount [-f] "
 			    "<path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one path operand must remain */
 	if (argc == 1) {
 		return (unshare_unmount_path(OP_MOUNT, argv[0], umount_flags,
 		    B_TRUE));
 	}
 
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing path "
 		    "argument\n"));
 	} else {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 	}
 	(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
 
 	return (2);
 }
 
 /*
  * Look 'command' up by name in command_table[].  On a match, store the
  * table index in *idx and return 0; return 1 (leaving *idx untouched) when
  * no entry matches.  Entries with a NULL name are skipped.
  */
 static int
 find_command_idx(char *command, int *idx)
 {
 	int slot = 0;
 
 	while (slot < NCOMMAND) {
 		if (command_table[slot].name != NULL &&
 		    strcmp(command, command_table[slot].name) == 0) {
 			*idx = slot;
 			return (0);
 		}
 		slot++;
 	}
 
 	return (1);
 }
 
 /*
  * zfs diff [-FHt] <snapshot> [snapshot|filesystem]
  *
  * Show the differences between a snapshot and a later snapshot or the
  * current state of its filesystem.  The filesystem to open is taken from
  * the first operand, or from the second one when the first starts with '@'.
  * Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_diff(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	int flags = 0;
 	char *tosnap = NULL;
 	char *fromsnap = NULL;
 	char *atp, *copy;
 	int err = 0;
 	int c;
 
 	while ((c = getopt(argc, argv, "FHt")) != -1) {
 		switch (c) {
 		case 'F':
 			flags |= ZFS_DIFF_CLASSIFY;
 			break;
 		case 'H':
 			flags |= ZFS_DIFF_PARSEABLE;
 			break;
 		case 't':
 			flags |= ZFS_DIFF_TIMESTAMP;
 			break;
 		default:
 			(void) fprintf(stderr,
 			    gettext("invalid option '%c'\n"), optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 1) {
 		(void) fprintf(stderr,
 		    gettext("must provide at least one snapshot name\n"));
 		usage(B_FALSE);
 	}
 
 	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	fromsnap = argv[0];
 	tosnap = (argc == 2) ? argv[1] : NULL;
 
 	/*
 	 * Work on a private copy of whichever operand carries the filesystem
 	 * name, so that the '@' can be cut off without touching argv.  'copy'
 	 * stays NULL both when strdup() fails and when no operand names a
 	 * filesystem ("@snap" given alone).
 	 */
 	copy = NULL;
 	if (*fromsnap != '@')
 		copy = strdup(fromsnap);
 	else if (tosnap)
 		copy = strdup(tosnap);
 	if (copy == NULL)
 		usage(B_FALSE);
 
 	if ((atp = strchr(copy, '@')) != NULL)
 		*atp = '\0';
 
 	/* the copy is only needed for the open; free it on every path */
 	zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM);
 	free(copy);
 	if (zhp == NULL)
 		return (1);
 
 	/*
 	 * Ignore SIGPIPE so that the library can give us
 	 * information on any failure
 	 */
 	(void) sigignore(SIGPIPE);
 
 	err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
 
 	zfs_close(zhp);
 
 	return (err != 0);
 }
 
 /*
  * zfs bookmark <fs@snap> <fs#bmark>
  *
  * Creates a bookmark with the given name from the given snapshot.  The
  * snapshot may be abbreviated to "@snap", in which case the filesystem part
  * is taken from the bookmark name.  Returns 0 on success and 1 if the
  * bookmark could not be created; argument errors go through usage().
  */
 static int
 zfs_do_bookmark(int argc, char **argv)
 {
 	char snapname[ZFS_MAXNAMELEN];
 	zfs_handle_t *zhp;
 	nvlist_t *nvl;
 	char *hash;
 	int ret = 0;
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "")) != -1) {
 		switch (c) {
 		case '?':
 			(void) fprintf(stderr,
 			    gettext("invalid option '%c'\n"), optopt);
 			goto usage;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		goto usage;
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing bookmark argument\n"));
 		goto usage;
 	}
 
 	if (strchr(argv[1], '#') == NULL) {
 		(void) fprintf(stderr,
 		    gettext("invalid bookmark name '%s' -- "
 		    "must contain a '#'\n"), argv[1]);
 		goto usage;
 	}
 
 	/*
 	 * Build the full snapshot name in snapname.  Every copy is bounded
 	 * and checked for truncation: a truncated name would either lose the
 	 * '#' we are about to cut at or silently name a different dataset.
 	 */
 	if (argv[0][0] == '@') {
 		/*
 		 * Snapshot name begins with @.
 		 * Default to same fs as bookmark.
 		 */
 		if (strlcpy(snapname, argv[1], sizeof (snapname)) >=
 		    sizeof (snapname) ||
 		    (hash = strchr(snapname, '#')) == NULL) {
 			(void) fprintf(stderr,
 			    gettext("invalid bookmark name '%s' -- "
 			    "name is too long\n"), argv[1]);
 			goto usage;
 		}
 		*hash = '\0';
 		if (strlcat(snapname, argv[0], sizeof (snapname)) >=
 		    sizeof (snapname)) {
 			(void) fprintf(stderr,
 			    gettext("invalid snapshot name '%s' -- "
 			    "name is too long\n"), argv[0]);
 			goto usage;
 		}
 	} else if (strlcpy(snapname, argv[0], sizeof (snapname)) >=
 	    sizeof (snapname)) {
 		(void) fprintf(stderr,
 		    gettext("invalid snapshot name '%s' -- "
 		    "name is too long\n"), argv[0]);
 		goto usage;
 	}
 
 	/* opening the snapshot only verifies that it exists */
 	zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
 	if (zhp == NULL)
 		goto usage;
 	zfs_close(zhp);
 
 
 	/* the request maps bookmark name -> source snapshot name */
 	nvl = fnvlist_alloc();
 	fnvlist_add_string(nvl, argv[1], snapname);
 	ret = lzc_bookmark(nvl, NULL);
 	fnvlist_free(nvl);
 
 	if (ret != 0) {
 		const char *err_msg;
 		char errbuf[1024];
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot create bookmark '%s'"), argv[1]);
 
 		switch (ret) {
 		case EXDEV:
 			err_msg = "bookmark is in a different pool";
 			break;
 		case EEXIST:
 			err_msg = "bookmark exists";
 			break;
 		case EINVAL:
 			err_msg = "invalid argument";
 			break;
 		case ENOTSUP:
 			err_msg = "bookmark feature not enabled";
 			break;
 		case ENOSPC:
 			err_msg = "out of space";
 			break;
 		default:
 			err_msg = "unknown error";
 			break;
 		}
 		(void) fprintf(stderr, "%s: %s\n", errbuf,
 		    dgettext(TEXT_DOMAIN, err_msg));
 	}
 
 	return (ret != 0);
 
 usage:
 	usage(B_FALSE);
 	return (-1);
 }
 
 /*
  * Program entry point.  Initializes libzfs and opens the mount table, then
  * either acts as the stand-alone mount/umount helper (when argv[0]'s
  * basename is "mount" or "umount") or dispatches argv[1] through
  * command_table[].  Returns the selected handler's result, or 1 when
  * initialization fails.
  */
 int
 main(int argc, char **argv)
 {
 	int ret = 0;
 	int i;
 	char *progname;
 	char *cmdname;
 
 	(void) setlocale(LC_ALL, "");
 	(void) textdomain(TEXT_DOMAIN);
 
 	/* the handlers print their own messages for bad options */
 	opterr = 0;
 
 	if ((g_zfs = libzfs_init()) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: failed to "
 		    "initialize ZFS library\n"));
 		return (1);
 	}
 
 	/* capture the command line now; it is logged at the end on success */
 	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
 
 	libzfs_print_on_error(g_zfs, B_TRUE);
 
 	if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: unable to "
 		    "open %s\n"), MNTTAB);
 		return (1);
 	}
 
 	/*
 	 * This command also doubles as the /etc/fs mount and unmount program.
 	 * Determine if we should take this behavior based on argv[0].
 	 */
 	progname = basename(argv[0]);
 	if (strcmp(progname, "mount") == 0) {
 		ret = manual_mount(argc, argv);
 	} else if (strcmp(progname, "umount") == 0) {
 		ret = manual_unmount(argc, argv);
 	} else {
 		/*
 		 * Make sure the user has specified some command.
 		 */
 		if (argc < 2) {
 			(void) fprintf(stderr, gettext("missing command\n"));
 			usage(B_FALSE);
 		}
 
 		cmdname = argv[1];
 
 		/*
 		 * The 'umount' command is an alias for 'unmount'
 		 */
 		if (strcmp(cmdname, "umount") == 0)
 			cmdname = "unmount";
 
 		/*
 		 * The 'recv' command is an alias for 'receive'
 		 */
 		if (strcmp(cmdname, "recv") == 0)
 			cmdname = "receive";
 
 		/*
 		 * The 'snap' command is an alias for 'snapshot'
 		 */
 		if (strcmp(cmdname, "snap") == 0)
 			cmdname = "snapshot";
 
 		/*
 		 * Special case '-?'
 		 */
 		if (strcmp(cmdname, "-?") == 0)
 			usage(B_TRUE);
 
 		/*
 		 * Run the appropriate command.
 		 */
 		libzfs_mnttab_cache(g_zfs, B_TRUE);
 		if (find_command_idx(cmdname, &i) == 0) {
 			current_command = &command_table[i];
 			/* drop the program name so argv[0] is the subcommand */
 			ret = command_table[i].func(argc - 1, argv + 1);
 		} else if (strchr(cmdname, '=') != NULL) {
 			/*
 			 * "zfs prop=value ..." is run through the 'set'
 			 * handler.  argv is passed unshifted here, so the
 			 * "prop=value" word ends up as the first operand.
 			 */
 			verify(find_command_idx("set", &i) == 0);
 			current_command = &command_table[i];
 			ret = command_table[i].func(argc, argv);
 		} else {
 			(void) fprintf(stderr, gettext("unrecognized "
 			    "command '%s'\n"), cmdname);
 			usage(B_FALSE);
 		}
 		libzfs_mnttab_cache(g_zfs, B_FALSE);
 	}
 
 	(void) fclose(mnttab_file);
 
 	/* history is recorded only for successful commands that asked for it */
 	if (ret == 0 && log_history)
 		(void) zpool_log_history(g_zfs, history_str);
 
 	libzfs_fini(g_zfs);
 
 	/*
 	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
 	 * for the purposes of running ::findleaks.
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	return (ret);
 }
Index: head/cddl/contrib/opensolaris/cmd/zfs
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs	(revision 296518)
+++ head/cddl/contrib/opensolaris/cmd/zfs	(revision 296519)

Property changes on: head/cddl/contrib/opensolaris/cmd/zfs
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor/illumos/dist/cmd/zfs:r296518
Index: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	(revision 296519)
@@ -1,627 +1,628 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <ctype.h>
 #include <libnvpair.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <stddef.h>
 
 #include <sys/dmu.h>
 #include <sys/zfs_ioctl.h>
 #include <zfs_fletcher.h>
 
 /*
  * If dump mode is enabled, the number of bytes to print per line
  */
 #define	BYTES_PER_LINE	16
 /*
  * If dump mode is enabled, the number of bytes to group together, separated
  * by newlines or spaces
  */
 #define	DUMP_GROUPING	4
 
 /* Sum of the drr_length of every DRR_WRITE record seen; shown in the summary. */
 uint64_t total_write_size = 0;
 /* Number of stream bytes consumed so far; advanced by ssread(). */
 uint64_t total_stream_len = 0;
 /* Stream being decoded; main() points this at stdin before reading. */
 FILE *send_stream = 0;
 /* Set by main() when the first record's magic is the byte-swapped value. */
 boolean_t do_byteswap = B_FALSE;
 /* Checksum computation/verification; on by default, cleared by -C. */
 boolean_t do_cksum = B_TRUE;
 
 /*
  * Print the command synopsis and option summary on stderr, then exit
  * with status 1.  Does not return.
  */
 static void
 usage(void)
 {
 	static const char *const help[] = {
 		"usage: zstreamdump [-v] [-C] [-d] < file\n",
 		"\t -v -- verbose\n",
 		"\t -C -- suppress checksum verification\n",
 		"\t -d -- dump contents of blocks modified, "
 		    "implies verbose\n",
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof (help) / sizeof (help[0]); i++)
 		(void) fputs(help[i], stderr);
 	exit(1);
 }
 
 /*
  * malloc() wrapper that never hands back NULL: if the allocation fails,
  * the requested size is reported on stderr and the process aborts.
  */
 static void *
 safe_malloc(size_t size)
 {
 	void *mem;
 
 	mem = malloc(size);
 	if (mem != NULL)
 		return (mem);
 
 	(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
 	    size);
 	abort();
 	/* NOTREACHED */
 }
 
 /*
  * ssread - send stream read.
  *
  * Read one item of 'len' bytes from send_stream into 'buf'.  When the
  * read succeeds, the bytes are folded into the running checksum *cksum
  * (unless checksumming is disabled; the byteswapping variant is used
  * when do_byteswap is set), total_stream_len grows by 'len', and the
  * item count reported by fread() is returned.  When nothing could be
  * read, 0 is returned and neither *cksum nor total_stream_len changes.
  */
 static size_t
 ssread(void *buf, size_t len, zio_cksum_t *cksum)
 {
 	size_t nitems = fread(buf, len, 1, send_stream);
 
 	if (nitems == 0)
 		return (0);
 
 	if (do_cksum && do_byteswap)
 		fletcher_4_incremental_byteswap(buf, len, cksum);
 	else if (do_cksum)
 		fletcher_4_incremental_native(buf, len, cksum);
 
 	total_stream_len += len;
 	return (nitems);
 }
 
 /*
  * Read the next replay record header from the stream into *drr.
  *
  * The header is read in two pieces so that the running checksum as it
  * stood just before the trailing checksum field can be captured and
  * compared against the checksum stored in that field.  A stored checksum
  * of all zeroes is not verified.  The comparison is skipped entirely
  * when checksumming has been disabled (-C): in that mode ssread() never
  * updates *cksum, so comparing against it would reject every record
  * that carries a checksum.
  *
  * Returns sizeof (*drr) on success, or 0 if the header could not be read
  * in full or its checksum did not match.
  */
 static size_t
 read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
 {
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum);
 	if (r == 0)
 		return (0);
 	/* Checksum of everything up to, but excluding, the checksum field. */
 	zio_cksum_t saved_cksum = *cksum;
 	r = ssread(&drr->drr_u.drr_checksum.drr_checksum,
 	    sizeof (zio_cksum_t), cksum);
 	if (r == 0)
 		return (0);
 	if (do_cksum &&
 	    !ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) &&
 	    !ZIO_CHECKSUM_EQUAL(saved_cksum,
 	    drr->drr_u.drr_checksum.drr_checksum)) {
 		(void) fprintf(stderr, "invalid checksum\n");
 		(void) printf("Incorrect checksum in record header.\n");
 		(void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
 		    (u_longlong_t)saved_cksum.zc_word[0],
 		    (u_longlong_t)saved_cksum.zc_word[1],
 		    (u_longlong_t)saved_cksum.zc_word[2],
 		    (u_longlong_t)saved_cksum.zc_word[3]);
 		return (0);
 	}
 	return (sizeof (*drr));
 }
 
 /*
  * Print part of a block in ASCII characters
  *
  * Writes 'length' bytes of 'subbuf' to stdout, substituting '.' for
  * every non-printable byte and inserting a space before each group of
  * DUMP_GROUPING bytes except the first, then ends the line.
  */
 static void
 print_ascii_block(char *subbuf, int length)
 {
 	int i;
 
 	for (i = 0; i < length; i++) {
 		/*
 		 * isprint() is only defined for values representable as
 		 * unsigned char (or EOF); plain char may be negative for
 		 * bytes >= 0x80, so convert before classifying.
 		 */
 		unsigned char byte = (unsigned char)subbuf[i];
 		char char_print = isprint(byte) ? subbuf[i] : '.';
 		if (i != 0 && i % DUMP_GROUPING == 0) {
 			(void) printf(" ");
 		}
 		(void) printf("%c", char_print);
 	}
 	(void) printf("\n");
 }
 
 /*
  * print_block - Dump the contents of a modified block to STDOUT
  *
  * Each output line shows up to BYTES_PER_LINE bytes as two-digit hex
  * values, with an extra space ahead of every DUMP_GROUPING-th byte,
  * followed by the same bytes rendered by print_ascii_block().
  *
  * Assume that buf has capacity evenly divisible by BYTES_PER_LINE
  */
 static void
 print_block(char *buf, int length)
 {
 	/*
 	 * Column at which the ASCII rendering begins: 3 characters per
 	 * byte (2 hex digits plus a space), one separator per group, plus
 	 * padding, so the ASCII column lines up even on a short last line.
 	 */
 	const int ascii_col = BYTES_PER_LINE * 3 +
 	    BYTES_PER_LINE / DUMP_GROUPING + 2;
 	int line_start;
 
 	for (line_start = 0; line_start < length;
 	    line_start += BYTES_PER_LINE) {
 		int line_len = MIN(BYTES_PER_LINE, length - line_start);
 		int printed = 0;
 		int k;
 
 		for (k = 0; k < line_len; k++) {
 			int pos = line_start + k;
 			unsigned char byte = buf[pos];
 
 			/* Group separator, keyed on the absolute offset. */
 			if (pos % DUMP_GROUPING == 0)
 				printed += printf(" ");
 
 			/* Two hex digits and a trailing space per byte. */
 			printed += printf("%02x ", byte);
 		}
 
 		/* Pad out to the fixed ASCII column. */
 		(void) printf("%*s", ascii_col - printed, " ");
 
 		print_ascii_block(buf + line_start, line_len);
 	}
 }
 
 /*
  * Read 'len' bytes of record payload into the scratch buffer 'buf',
  * folding them into the running checksum.  The scratch buffer is never
  * smaller than SPA_MAXBLOCKSIZE, so a length beyond that, which comes
  * straight from the (untrusted) stream, is treated as a corrupt stream
  * and terminates the program instead of overrunning the buffer.
  */
 static void
 read_payload(char *buf, uint64_t len, zio_cksum_t *cksum,
     const char *rectype)
 {
 	if (len > SPA_MAXBLOCKSIZE) {
 		(void) fprintf(stderr, "Invalid stream (%s payload of %llu "
 		    "bytes exceeds maximum block size)\n", rectype,
 		    (u_longlong_t)len);
 		exit(1);
 	}
 	(void) ssread(buf, len, cksum);
 }
 
 /*
  * zstreamdump: decode a ZFS send stream read from standard input and
  * print a description of each record, followed by a summary of record
  * counts and sizes.
  *
  * Options:
  *	-v	verbose (given twice: very verbose, adds per-record checksums)
  *	-C	suppress checksum verification
  *	-d	dump the payload of data-carrying records; implies very verbose
  *
  * Exits with status 1 on usage errors, when stdin is a terminal, on a bad
  * magic number, on an unknown record type, or on an oversized payload.
  * Otherwise returns 0.
  */
 int
 main(int argc, char *argv[])
 {
 	char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
 	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
 	uint64_t total_records = 0;
 	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
 	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
 	struct drr_end *drre = &thedrr.drr_u.drr_end;
 	struct drr_object *drro = &thedrr.drr_u.drr_object;
 	struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
 	struct drr_write *drrw = &thedrr.drr_u.drr_write;
 	struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
 	struct drr_free *drrf = &thedrr.drr_u.drr_free;
 	struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
 	struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
 	struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
 	/* getopt() returns int; a char would break the -1 test if unsigned */
 	int c;
 	boolean_t verbose = B_FALSE;
 	boolean_t very_verbose = B_FALSE;
 	boolean_t first = B_TRUE;
 	/*
 	 * dump flag controls whether the contents of any modified data blocks
 	 * are printed to the console during processing of the stream. Warning:
 	 * for large streams, this can obviously lead to massive prints.
 	 */
 	boolean_t dump = B_FALSE;
 	int err;
 	zio_cksum_t zc = { 0 };
 	zio_cksum_t pcksum = { 0 };
 
 	while ((c = getopt(argc, argv, ":vCd")) != -1) {
 		switch (c) {
 		case 'C':
 			do_cksum = B_FALSE;
 			break;
 		case 'v':
 			if (verbose)
 				very_verbose = B_TRUE;
 			verbose = B_TRUE;
 			break;
 		case 'd':
 			dump = B_TRUE;
 			verbose = B_TRUE;
 			very_verbose = B_TRUE;
 			break;
 		case ':':
 			(void) fprintf(stderr,
 			    "missing argument for '%c' option\n", optopt);
 			usage();
 			break;
 		case '?':
 			(void) fprintf(stderr, "invalid option '%c'\n",
 			    optopt);
 			usage();
 			break;
 		}
 	}
 
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    "Error: Backup stream can not be read "
 		    "from a terminal.\n"
 		    "You must redirect standard input.\n");
 		exit(1);
 	}
 
 	send_stream = stdin;
 	pcksum = zc;
 	while (read_hdr(drr, &zc)) {
 
 		/*
 		 * If this is the first DMU record being processed, check for
 		 * the magic bytes and figure out the endian-ness based on them.
 		 */
 		if (first) {
 			if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 				do_byteswap = B_TRUE;
 				if (do_cksum) {
 					ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
 					/*
 					 * recalculate header checksum now
 					 * that we know it needs to be
 					 * byteswapped.
 					 */
 					fletcher_4_incremental_byteswap(drr,
 					    sizeof (dmu_replay_record_t), &zc);
 				}
 			} else if (drrb->drr_magic != DMU_BACKUP_MAGIC) {
 				(void) fprintf(stderr, "Invalid stream "
 				    "(bad magic number)\n");
 				exit(1);
 			}
 			first = B_FALSE;
 		}
 		if (do_byteswap) {
 			drr->drr_type = BSWAP_32(drr->drr_type);
 			drr->drr_payloadlen =
 			    BSWAP_32(drr->drr_payloadlen);
 		}
 
 		/*
 		 * At this point, the leading fields of the replay record
 		 * (drr_type and drr_payloadlen) have been byte-swapped if
 		 * necessary, but the rest of the data structure (the
 		 * union of type-specific structures) is still in its
 		 * original state.
 		 */
 		if (drr->drr_type >= DRR_NUMTYPES) {
 			(void) printf("INVALID record found: type 0x%x\n",
 			    drr->drr_type);
 			(void) printf("Aborting.\n");
 			exit(1);
 		}
 
 		drr_record_count[drr->drr_type]++;
 		total_records++;
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
 			if (do_byteswap) {
 				drrb->drr_magic = BSWAP_64(drrb->drr_magic);
 				drrb->drr_versioninfo =
 				    BSWAP_64(drrb->drr_versioninfo);
 				drrb->drr_creation_time =
 				    BSWAP_64(drrb->drr_creation_time);
 				drrb->drr_type = BSWAP_32(drrb->drr_type);
 				drrb->drr_flags = BSWAP_32(drrb->drr_flags);
 				drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
 				drrb->drr_fromguid =
 				    BSWAP_64(drrb->drr_fromguid);
 			}
 
 			(void) printf("BEGIN record\n");
 			(void) printf("\thdrtype = %lld\n", (longlong_t)
 			    DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo));
 			(void) printf("\tfeatures = %llx\n", (u_longlong_t)
 			    DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo));
 			(void) printf("\tmagic = %llx\n",
 			    (u_longlong_t)drrb->drr_magic);
 			(void) printf("\tcreation_time = %llx\n",
 			    (u_longlong_t)drrb->drr_creation_time);
 			(void) printf("\ttype = %u\n", drrb->drr_type);
 			(void) printf("\tflags = 0x%x\n", drrb->drr_flags);
 			(void) printf("\ttoguid = %llx\n",
 			    (u_longlong_t)drrb->drr_toguid);
 			(void) printf("\tfromguid = %llx\n",
 			    (u_longlong_t)drrb->drr_fromguid);
 			(void) printf("\ttoname = %s\n", drrb->drr_toname);
 			if (verbose)
 				(void) printf("\n");
 
 			if (drr->drr_payloadlen != 0) {
 				nvlist_t *nv;
 				/*
 				 * Keep the length unsigned: a signed int
 				 * would turn large values negative, skip
 				 * the regrow below and then be converted
 				 * to a huge read length.
 				 */
 				size_t sz = drr->drr_payloadlen;
 
 				if (sz > SPA_MAXBLOCKSIZE) {
 					free(buf);
 					buf = safe_malloc(sz);
 				}
 				if (ssread(buf, sz, &zc) == 0) {
 					/* Nothing usable to unpack. */
 					if (ferror(send_stream))
 						perror("fread");
 					else
 						(void) fprintf(stderr,
 						    "short read of BEGIN "
 						    "record payload\n");
 					break;
 				}
 				err = nvlist_unpack(buf, sz, &nv, 0);
 				if (err != 0) {
 					/* nv was not set; do not touch it. */
 					(void) fprintf(stderr,
 					    "nvlist_unpack: %s\n",
 					    strerror(err));
 				} else {
 					nvlist_print(stdout, nv);
 					nvlist_free(nv);
 				}
 			}
 			break;
 
 		case DRR_END:
 			if (do_byteswap) {
 				drre->drr_checksum.zc_word[0] =
 				    BSWAP_64(drre->drr_checksum.zc_word[0]);
 				drre->drr_checksum.zc_word[1] =
 				    BSWAP_64(drre->drr_checksum.zc_word[1]);
 				drre->drr_checksum.zc_word[2] =
 				    BSWAP_64(drre->drr_checksum.zc_word[2]);
 				drre->drr_checksum.zc_word[3] =
 				    BSWAP_64(drre->drr_checksum.zc_word[3]);
 			}
 			/*
 			 * We compare against the *previous* checksum
 			 * value, because the stored checksum is of
 			 * everything before the DRR_END record.
 			 */
 			if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum,
 			    pcksum)) {
 				(void) printf("Expected checksum differs from "
 				    "checksum in stream.\n");
 				(void) printf("Expected checksum = "
 				    "%llx/%llx/%llx/%llx\n",
 				    (u_longlong_t)pcksum.zc_word[0],
 				    (u_longlong_t)pcksum.zc_word[1],
 				    (u_longlong_t)pcksum.zc_word[2],
 				    (u_longlong_t)pcksum.zc_word[3]);
 			}
 			(void) printf("END checksum = %llx/%llx/%llx/%llx\n",
 			    (u_longlong_t)drre->drr_checksum.zc_word[0],
 			    (u_longlong_t)drre->drr_checksum.zc_word[1],
 			    (u_longlong_t)drre->drr_checksum.zc_word[2],
 			    (u_longlong_t)drre->drr_checksum.zc_word[3]);
 
 			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
 			break;
 
 		case DRR_OBJECT:
 			if (do_byteswap) {
 				drro->drr_object = BSWAP_64(drro->drr_object);
 				drro->drr_type = BSWAP_32(drro->drr_type);
 				drro->drr_bonustype =
 				    BSWAP_32(drro->drr_bonustype);
 				drro->drr_blksz = BSWAP_32(drro->drr_blksz);
 				drro->drr_bonuslen =
 				    BSWAP_32(drro->drr_bonuslen);
 				drro->drr_toguid = BSWAP_64(drro->drr_toguid);
 			}
 			if (verbose) {
 				(void) printf("OBJECT object = %llu type = %u "
 				    "bonustype = %u blksz = %u bonuslen = %u\n",
 				    (u_longlong_t)drro->drr_object,
 				    drro->drr_type,
 				    drro->drr_bonustype,
 				    drro->drr_blksz,
 				    drro->drr_bonuslen);
 			}
 			if (drro->drr_bonuslen > 0) {
 				/* Round up in 64 bits so it cannot wrap. */
 				uint64_t bonus_sz =
 				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8);
 
 				read_payload(buf, bonus_sz, &zc, "OBJECT");
 				if (dump) {
 					print_block(buf, bonus_sz);
 				}
 			}
 			break;
 
 		case DRR_FREEOBJECTS:
 			if (do_byteswap) {
 				drrfo->drr_firstobj =
 				    BSWAP_64(drrfo->drr_firstobj);
 				drrfo->drr_numobjs =
 				    BSWAP_64(drrfo->drr_numobjs);
 				drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid);
 			}
 			if (verbose) {
 				(void) printf("FREEOBJECTS firstobj = %llu "
 				    "numobjs = %llu\n",
 				    (u_longlong_t)drrfo->drr_firstobj,
 				    (u_longlong_t)drrfo->drr_numobjs);
 			}
 			break;
 
 		case DRR_WRITE:
 			if (do_byteswap) {
 				drrw->drr_object = BSWAP_64(drrw->drr_object);
 				drrw->drr_type = BSWAP_32(drrw->drr_type);
 				drrw->drr_offset = BSWAP_64(drrw->drr_offset);
 				drrw->drr_length = BSWAP_64(drrw->drr_length);
 				drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
 				drrw->drr_key.ddk_prop =
 				    BSWAP_64(drrw->drr_key.ddk_prop);
 			}
 			/*
 			 * If this is verbose and/or dump output,
 			 * print info on the modified block
 			 */
 			if (verbose) {
 				(void) printf("WRITE object = %llu type = %u "
 				    "checksum type = %u\n"
 				    "    offset = %llu length = %llu "
 				    "props = %llx\n",
 				    (u_longlong_t)drrw->drr_object,
 				    drrw->drr_type,
 				    drrw->drr_checksumtype,
 				    (u_longlong_t)drrw->drr_offset,
 				    (u_longlong_t)drrw->drr_length,
 				    (u_longlong_t)drrw->drr_key.ddk_prop);
 			}
 			/*
 			 * Read the contents of the block in from STDIN to buf
 			 */
 			read_payload(buf, drrw->drr_length, &zc, "WRITE");
 			/*
 			 * If in dump mode
 			 */
 			if (dump) {
 				print_block(buf, drrw->drr_length);
 			}
 			total_write_size += drrw->drr_length;
 			break;
 
 		case DRR_WRITE_BYREF:
 			if (do_byteswap) {
 				drrwbr->drr_object =
 				    BSWAP_64(drrwbr->drr_object);
 				drrwbr->drr_offset =
 				    BSWAP_64(drrwbr->drr_offset);
 				drrwbr->drr_length =
 				    BSWAP_64(drrwbr->drr_length);
 				drrwbr->drr_toguid =
 				    BSWAP_64(drrwbr->drr_toguid);
 				drrwbr->drr_refguid =
 				    BSWAP_64(drrwbr->drr_refguid);
 				drrwbr->drr_refobject =
 				    BSWAP_64(drrwbr->drr_refobject);
 				drrwbr->drr_refoffset =
 				    BSWAP_64(drrwbr->drr_refoffset);
 				drrwbr->drr_key.ddk_prop =
 				    BSWAP_64(drrwbr->drr_key.ddk_prop);
 			}
 			if (verbose) {
 				(void) printf("WRITE_BYREF object = %llu "
 				    "checksum type = %u props = %llx\n"
 				    "    offset = %llu length = %llu\n"
 				    "toguid = %llx refguid = %llx\n"
 				    "    refobject = %llu refoffset = %llu\n",
 				    (u_longlong_t)drrwbr->drr_object,
 				    drrwbr->drr_checksumtype,
 				    (u_longlong_t)drrwbr->drr_key.ddk_prop,
 				    (u_longlong_t)drrwbr->drr_offset,
 				    (u_longlong_t)drrwbr->drr_length,
 				    (u_longlong_t)drrwbr->drr_toguid,
 				    (u_longlong_t)drrwbr->drr_refguid,
 				    (u_longlong_t)drrwbr->drr_refobject,
 				    (u_longlong_t)drrwbr->drr_refoffset);
 			}
 			break;
 
 		case DRR_FREE:
 			if (do_byteswap) {
 				drrf->drr_object = BSWAP_64(drrf->drr_object);
 				drrf->drr_offset = BSWAP_64(drrf->drr_offset);
 				drrf->drr_length = BSWAP_64(drrf->drr_length);
 			}
 			if (verbose) {
 				(void) printf("FREE object = %llu "
 				    "offset = %llu length = %lld\n",
 				    (u_longlong_t)drrf->drr_object,
 				    (u_longlong_t)drrf->drr_offset,
 				    (longlong_t)drrf->drr_length);
 			}
 			break;
 		case DRR_SPILL:
 			if (do_byteswap) {
 				drrs->drr_object = BSWAP_64(drrs->drr_object);
 				drrs->drr_length = BSWAP_64(drrs->drr_length);
 			}
 			if (verbose) {
 				(void) printf("SPILL block for object = %llu "
 				    "length = %llu\n",
 				    (u_longlong_t)drrs->drr_object,
 				    (u_longlong_t)drrs->drr_length);
 			}
 			read_payload(buf, drrs->drr_length, &zc, "SPILL");
 			if (dump) {
 				print_block(buf, drrs->drr_length);
 			}
 			break;
 		case DRR_WRITE_EMBEDDED:
 			if (do_byteswap) {
 				drrwe->drr_object =
 				    BSWAP_64(drrwe->drr_object);
 				drrwe->drr_offset =
 				    BSWAP_64(drrwe->drr_offset);
 				drrwe->drr_length =
 				    BSWAP_64(drrwe->drr_length);
 				drrwe->drr_toguid =
 				    BSWAP_64(drrwe->drr_toguid);
 				drrwe->drr_lsize =
 				    BSWAP_32(drrwe->drr_lsize);
 				drrwe->drr_psize =
 				    BSWAP_32(drrwe->drr_psize);
 			}
 			if (verbose) {
 				(void) printf("WRITE_EMBEDDED object = %llu "
 				    "offset = %llu length = %llu\n"
 				    "    toguid = %llx comp = %u etype = %u "
 				    "lsize = %u psize = %u\n",
 				    (u_longlong_t)drrwe->drr_object,
 				    (u_longlong_t)drrwe->drr_offset,
 				    (u_longlong_t)drrwe->drr_length,
 				    (u_longlong_t)drrwe->drr_toguid,
 				    drrwe->drr_compression,
 				    drrwe->drr_etype,
 				    drrwe->drr_lsize,
 				    drrwe->drr_psize);
 			}
 			/* Round up in 64 bits so it cannot wrap. */
 			read_payload(buf,
 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), &zc,
 			    "WRITE_EMBEDDED");
 			break;
 		}
 		if (drr->drr_type != DRR_BEGIN && very_verbose) {
 			(void) printf("    checksum = %llx/%llx/%llx/%llx\n",
 			    (longlong_t)drrc->drr_checksum.zc_word[0],
 			    (longlong_t)drrc->drr_checksum.zc_word[1],
 			    (longlong_t)drrc->drr_checksum.zc_word[2],
 			    (longlong_t)drrc->drr_checksum.zc_word[3]);
 		}
 		/* Remember the checksum as of the end of this record. */
 		pcksum = zc;
 	}
 	free(buf);
 
 	/* Print final summary */
 
 	(void) printf("SUMMARY:\n");
 	(void) printf("\tTotal DRR_BEGIN records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_BEGIN]);
 	(void) printf("\tTotal DRR_END records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_END]);
 	(void) printf("\tTotal DRR_OBJECT records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_OBJECT]);
 	(void) printf("\tTotal DRR_FREEOBJECTS records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
 	(void) printf("\tTotal DRR_WRITE records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_WRITE]);
 	(void) printf("\tTotal DRR_WRITE_BYREF records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
 	(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
 	(void) printf("\tTotal DRR_FREE records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_FREE]);
 	(void) printf("\tTotal DRR_SPILL records = %llu\n",
 	    (u_longlong_t)drr_record_count[DRR_SPILL]);
 	(void) printf("\tTotal records = %llu\n",
 	    (u_longlong_t)total_records);
 	(void) printf("\tTotal write size = %llu (0x%llx)\n",
 	    (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
 	(void) printf("\tTotal stream length = %llu (0x%llx)\n",
 	    (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
 	return (0);
 }
Index: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	(revision 296519)
@@ -1,6325 +1,6326 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>.  All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  * that runs entirely in userland, is easy to use, and easy to extend.
  *
  * The overall design of the ztest program is as follows:
  *
  * (1) For each major functional area (e.g. adding vdevs to a pool,
  *     creating and destroying datasets, reading and writing objects, etc)
  *     we have a simple routine to test that functionality.  These
  *     individual routines do not have to do anything "stressful".
  *
  * (2) We turn these simple functionality tests into a stress test by
  *     running them all in parallel, with as many threads as desired,
  *     and spread across as many datasets, objects, and vdevs as desired.
  *
  * (3) While all this is happening, we inject faults into the pool to
  *     verify that self-healing data really works.
  *
  * (4) Every time we open a dataset, we change its checksum and compression
  *     functions.  Thus even individual objects vary from block to block
  *     in which checksum they use and whether they're compressed.
  *
  * (5) To verify that we never lose on-disk consistency after a crash,
  *     we run the entire test in a child of the main process.
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
  *     storage pool, as many times as desired. If backwards compatibility
  *     testing is enabled ztest will sometimes run the "older" version
  *     of ztest after a SIGKILL.
  *
  * (6) To verify that we don't have future leaks or temporal incursions,
  *     many of the functional tests record the transaction group number
  *     as part of their data.  When reading old data, they verify that
  *     the transaction group number is less than the current, open txg.
  *     If you add a new test, please do this if applicable.
  *
  * When run with no arguments, ztest runs for about five minutes and
  * produces no output if successful.  To get a little bit of information,
  * specify -V.  To get more information, specify -VV, and so on.
  *
  * To turn this into an overnight stress test, use -T to specify run time.
  *
  * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
  * to increase the pool capacity, fanout, and overall stress level.
  *
  * Use the -k option to set the desired frequency of kills.
  *
  * When ztest invokes itself it passes all relevant information through a
  * temporary file which is mmap-ed in the child process. This allows shared
  * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
  * stored at offset 0 of this file and contains information on the size and
  * number of shared structures in the file. The information stored in this file
  * must remain backwards compatible with older versions of ztest so that
  * ztest can invoke them during backwards compatibility testing (-B).
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/dmu_objset.h>
 #include <sys/poll.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_scan.h>
 #include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_userhold.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
 #include <umem.h>
 #include <dlfcn.h>
 #include <ctype.h>
 #include <math.h>
 #include <errno.h>
 #include <sys/fs/zfs.h>
 #include <libnvpair.h>
 
 static int ztest_fd_data = -1;
 static int ztest_fd_rand = -1;
 
 /*
  * Header of the temporary file shared between ztest and the child it
  * execs.  Per the overview comment at the top of this file, it is always
  * stored at offset 0 of that file and records the size and number of the
  * shared structures the file contains.
  *
  * NOTE(review): the individual zh_*_size / zh_*_count members presumably
  * describe the header itself, the options block, the ztest_shared_t
  * block, the per-function stats array and the per-dataset array, in that
  * order -- confirm against the code that lays out the shared file.
  */
 typedef struct ztest_shared_hdr {
 	uint64_t	zh_hdr_size;
 	uint64_t	zh_opts_size;
 	uint64_t	zh_size;
 	uint64_t	zh_stats_size;
 	uint64_t	zh_stats_count;
 	uint64_t	zh_ds_size;
 	uint64_t	zh_ds_count;
 } ztest_shared_hdr_t;
 
 static ztest_shared_hdr_t *ztest_shared_hdr;
 
 /*
  * Run-time options for a ztest run.  Initial values are taken from
  * ztest_opts_defaults.
  *
  * NOTE(review): the "shared" in the name and the overview comment's
  * backwards-compatibility requirement for the shared file suggest that
  * the layout of this struct must stay stable across versions -- confirm
  * before reordering or resizing members.
  */
 typedef struct ztest_shared_opts {
 	char zo_pool[MAXNAMELEN];
 	char zo_dir[MAXNAMELEN];
 	char zo_alt_ztest[MAXNAMELEN];
 	char zo_alt_libpath[MAXNAMELEN];
 	uint64_t zo_vdevs;
 	uint64_t zo_vdevtime;	/* interval used for vdev add/remove tests */
 	size_t zo_vdev_size;
 	int zo_ashift;
 	int zo_mirrors;
 	int zo_raidz;
 	int zo_raidz_parity;
 	int zo_datasets;
 	int zo_threads;
 	uint64_t zo_passtime;	/* seconds */
 	uint64_t zo_killrate;	/* percent */
 	int zo_verbose;
 	int zo_init;
 	uint64_t zo_time;	/* seconds */
 	uint64_t zo_maxloops;	/* max loops during spa_freeze() */
 	uint64_t zo_metaslab_gang_bang;
 } ztest_shared_opts_t;
 
 /*
  * Default option values.  Members that are not named in this initializer
  * (zo_vdevtime) are zero-initialized.
  */
 static const ztest_shared_opts_t ztest_opts_defaults = {
 	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
 	.zo_dir = { '/', 't', 'm', 'p', '\0' },
 	.zo_alt_ztest = { '\0' },
 	.zo_alt_libpath = { '\0' },
 	.zo_vdevs = 5,
 	.zo_ashift = SPA_MINBLOCKSHIFT,
 	.zo_mirrors = 2,
 	.zo_raidz = 4,
 	.zo_raidz_parity = 1,
 	.zo_vdev_size = SPA_MINDEVSIZE * 2,
 	.zo_datasets = 7,
 	.zo_threads = 23,
 	.zo_passtime = 60,		/* 60 seconds */
 	.zo_killrate = 70,		/* 70% kill rate */
 	.zo_verbose = 0,
 	.zo_init = 1,
 	.zo_time = 300,			/* 5 minutes */
 	.zo_maxloops = 50,		/* max loops during spa_freeze() */
 	.zo_metaslab_gang_bang = 32 << 10
 };
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
 
 typedef struct ztest_shared_ds {
 	uint64_t	zd_seq;
 } ztest_shared_ds_t;
 
 static ztest_shared_ds_t *ztest_shared_ds;
 #define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
 
 #define	BT_MAGIC	0x123456789abcdefULL
 #define	MAXFAULTS() \
 	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
 
 /*
  * Kinds of I/O operation.  ZTEST_IO_TYPES is not an operation itself:
  * being the last enumerator, its value equals the number of operation
  * kinds listed before it.
  */
 enum ztest_io_type {
 	ZTEST_IO_WRITE_TAG,
 	ZTEST_IO_WRITE_PATTERN,
 	ZTEST_IO_WRITE_ZEROES,
 	ZTEST_IO_TRUNCATE,
 	ZTEST_IO_SETATTR,
 	ZTEST_IO_REWRITE,
 	ZTEST_IO_TYPES
 };
 
 typedef struct ztest_block_tag {
 	uint64_t	bt_magic;
 	uint64_t	bt_objset;
 	uint64_t	bt_object;
 	uint64_t	bt_offset;
 	uint64_t	bt_gen;
 	uint64_t	bt_txg;
 	uint64_t	bt_crtxg;
 } ztest_block_tag_t;
 
 typedef struct bufwad {
 	uint64_t	bw_index;
 	uint64_t	bw_txg;
 	uint64_t	bw_data;
 } bufwad_t;
 
 /*
  * XXX -- fix zfs range locks to be generic so we can use them here.
  */
 typedef enum {
 	RL_READER,
 	RL_WRITER,
 	RL_APPEND
 } rl_type_t;
 
 /*
  * A single lock object: one writer pointer, a reader count, and the
  * mutex/condition variable pair stored alongside them.
  * NOTE(review): presumably a reader/writer lock in which rll_lock guards
  * the two state members and rll_cv is used to wait for them to change --
  * confirm against the lock/unlock routines.
  */
 typedef struct rll {
 	void		*rll_writer;
 	int		rll_readers;
 	mutex_t		rll_lock;
 	cond_t		rll_cv;
 } rll_t;
 
 /*
  * Describes a locked range: an object number, an offset and size within
  * it, and a pointer to the rll_t associated with the range.
  */
 typedef struct rl {
 	uint64_t	rl_object;
 	uint64_t	rl_offset;
 	uint64_t	rl_size;
 	rll_t		*rl_lock;
 } rl_t;
 
 #define	ZTEST_RANGE_LOCKS	64
 #define	ZTEST_OBJECT_LOCKS	64
 
 /*
  * Object descriptor.  Used as a template for object lookup/create/remove.
  */
 typedef struct ztest_od {
 	uint64_t	od_dir;
 	uint64_t	od_object;
 	dmu_object_type_t od_type;
 	dmu_object_type_t od_crtype;
 	uint64_t	od_blocksize;
 	uint64_t	od_crblocksize;
 	uint64_t	od_gen;
 	uint64_t	od_crgen;
 	char		od_name[MAXNAMELEN];
 } ztest_od_t;
 
 /*
  * Per-dataset state.
  */
 typedef struct ztest_ds {
 	ztest_shared_ds_t *zd_shared;
 	objset_t	*zd_os;
 	rwlock_t	zd_zilog_lock;
 	zilog_t		*zd_zilog;
 	ztest_od_t	*zd_od;		/* debugging aid */
 	char		zd_name[MAXNAMELEN];
 	mutex_t		zd_dirobj_lock;
 	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
 	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
 } ztest_ds_t;
 
 /*
  * Per-iteration state.
  */
 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
 
 typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
 	uint64_t	zi_iters;	/* iterations per execution */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
 } ztest_info_t;
 
 typedef struct ztest_shared_callstate {
 	uint64_t	zc_count;	/* per-pass count */
 	uint64_t	zc_time;	/* per-pass time */
 	uint64_t	zc_next;	/* next time to call this function */
 } ztest_shared_callstate_t;
 
 static ztest_shared_callstate_t *ztest_shared_callstate;
 #define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
 
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_zil_commit;
 ztest_func_t ztest_zil_remount;
 ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_prealloc;
 ztest_func_t ztest_fzap;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
 ztest_func_t ztest_dsl_prop_get_set;
 ztest_func_t ztest_spa_prop_get_set;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
 ztest_func_t ztest_ddt_repair;
 ztest_func_t ztest_dmu_snapshot_hold;
 ztest_func_t ztest_spa_rename;
 ztest_func_t ztest_scrub;
 ztest_func_t ztest_dsl_dataset_promote_busy;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
 ztest_func_t ztest_split_pool;
 ztest_func_t ztest_reguid;
 ztest_func_t ztest_spa_upgrade;
 
 uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
 uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
 uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
 uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
 uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */
 
 /*
  * Table of test functions.  Each row is a ztest_info_t: the test
  * function, its iterations per execution, and a pointer to the interval
  * at which it is executed.  Most rows point at one of the zopt_*
  * intervals; the two vdev add/remove tests take their interval from the
  * zo_vdevtime option instead.  ztest_dmu_prealloc is compiled out.
  */
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			1,	&zopt_always	},
 	{ ztest_dmu_write_parallel,		10,	&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
 	{ ztest_dmu_commit_callbacks,		1,	&zopt_always	},
 	{ ztest_zap,				30,	&zopt_always	},
 	{ ztest_zap_parallel,			100,	&zopt_always	},
 	{ ztest_split_pool,			1,	&zopt_always	},
 	{ ztest_zil_commit,			1,	&zopt_incessant	},
 	{ ztest_zil_remount,			1,	&zopt_sometimes	},
 	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
 	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
 	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
 	{ ztest_spa_prop_get_set,		1,	&zopt_sometimes	},
 #if 0
 	{ ztest_dmu_prealloc,			1,	&zopt_sometimes	},
 #endif
 	{ ztest_fzap,				1,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
 	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
 	{ ztest_fault_inject,			1,	&zopt_sometimes	},
 	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
 	{ ztest_reguid,				1,	&zopt_rarely	},
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
 	{ ztest_scrub,				1,	&zopt_rarely	},
 	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
 	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
 	{ ztest_vdev_attach_detach,		1,	&zopt_sometimes	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
 	{ ztest_vdev_add_remove,		1,
 	    &ztest_opts.zo_vdevtime				},
 	{ ztest_vdev_aux_add_remove,		1,
 	    &ztest_opts.zo_vdevtime				},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
 
 /*
  * The following struct is used to hold a list of uncalled commit callbacks.
  * The callbacks are ordered by txg number.
  */
 typedef struct ztest_cb_list {
 	mutex_t	zcl_callbacks_lock;
 	list_t	zcl_callbacks;
 } ztest_cb_list_t;
 
 /*
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
 	boolean_t	zs_do_init;
 	hrtime_t	zs_proc_start;
 	hrtime_t	zs_proc_stop;
 	hrtime_t	zs_thread_start;
 	hrtime_t	zs_thread_stop;
 	hrtime_t	zs_thread_kill;
 	/* bumped by ztest_record_enospc() */
 	uint64_t	zs_enospc_count;
 	/* number used for the next leaf vdev file; see make_vdev_file() */
 	uint64_t	zs_vdev_next_leaf;
 	/* number used in aux vdev file names; see make_vdev_file() */
 	uint64_t	zs_vdev_aux;
 	/* normal-class allocated and total space, recorded by ztest_kill() */
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
 	uint64_t	zs_splits;
 	uint64_t	zs_mirrors;
 	uint64_t	zs_metaslab_sz;
 	uint64_t	zs_metaslab_df_alloc_threshold;
 	uint64_t	zs_guid;
 } ztest_shared_t;
 
 #define	ID_PARALLEL	-1ULL
 
 /* Leaf vdev file path format: "<dir>/<pool>.<n>a"; see make_vdev_file(). */
 static char ztest_dev_template[] = "%s/%s.%llua";
 /* Aux vdev file path format: "<dir>/<pool>.<aux>.<n>"; see make_vdev_file(). */
 static char ztest_aux_template[] = "%s/%s.%s.%llu";
 ztest_shared_t *ztest_shared;
 
 static spa_t *ztest_spa = NULL;
 static ztest_ds_t *ztest_ds;
 
 static mutex_t ztest_vdev_lock;
 
 /*
  * The ztest_name_lock protects the pool and dataset namespace used by
  * the individual tests. To modify the namespace, consumers must grab
  * this lock as writer. Grabbing the lock as reader will ensure that the
  * namespace does not change while the lock is held.
  */
 static rwlock_t ztest_name_lock;
 
 /* When B_TRUE, fatal() calls abort() instead of exit(3). */
 static boolean_t ztest_dump_core = B_TRUE;
 static boolean_t ztest_exiting;
 
 /* Global commit callback list */
 static ztest_cb_list_t zcl;
 
 /* Well-known object numbers; ZTEST_DIROBJ is the directory ZAP object. */
 enum ztest_object {
 	ZTEST_META_DNODE = 0,
 	ZTEST_DIROBJ,
 	ZTEST_OBJECTS
 };
 
 /* Forward declaration: usage() is called by the option parsers above it. */
 static void usage(boolean_t) __NORETURN;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 /*
  * libumem hook: returns the default $UMEM_DEBUG setting string.
  * Declared with an explicit (void) parameter list so the definition is a
  * proper prototype, matching _umem_logging_init().
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 /*
  * libumem hook: returns the default $UMEM_LOGGING setting string.
  */
 const char *
 _umem_logging_init(void)
 {
 	static const char logging_setting[] = "fail,contents";
 
 	return (logging_setting);
 }
 
 #define	FATAL_MSG_SZ	1024
 
 char *fatal_msg;
 
 /*
  * Report a fatal error and terminate the process.
  *
  * The printf-style 'message' is prefixed with "ztest: " and written to
  * stderr; when 'do_perror' is non-zero, ": <strerror(errno)>" is appended,
  * using the errno value seen on entry.  The process then calls abort() if
  * ztest_dump_core is set, otherwise exit(3).  This function never returns.
  *
  * All formatting is bounded by FATAL_MSG_SZ; an over-long message is
  * truncated rather than overrunning the stack buffer.
  */
 static void
 fatal(int do_perror, char *message, ...)
 {
 	va_list args;
 	int save_errno = errno;	/* capture before any call can clobber it */
 	char buf[FATAL_MSG_SZ];
 	size_t len;
 
 	(void) fflush(stdout);
 
 	(void) snprintf(buf, sizeof (buf), "ztest: ");
 	len = strlen(buf);
 	va_start(args, message);
 	/* LINTED */
 	(void) vsnprintf(buf + len, sizeof (buf) - len, message, args);
 	va_end(args);
 	if (do_perror) {
 		/* strlen() gives the real (possibly truncated) length. */
 		len = strlen(buf);
 		(void) snprintf(buf + len, sizeof (buf) - len,
 		    ": %s", strerror(save_errno));
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
 	if (ztest_dump_core)
 		abort();
 	exit(3);
 }
 
 /*
  * Convert a size suffix ("", "K", "MB", ...) into a power-of-two shift.
  *
  * An empty string yields 0.  Otherwise the first character selects the
  * unit from "BKMGTPEZ" (case-insensitive), giving a shift of 10 times its
  * index; it may optionally be followed by a single 'B'.  Anything else
  * prints an error and calls usage(), which does not return.
  */
 static int
 str2shift(const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	size_t nends = strlen(ends);
 	size_t i;
 
 	if (buf[0] == '\0')
 		return (0);
 	for (i = 0; i < nends; i++) {
 		/* toupper() requires a value representable as unsigned char */
 		if (toupper((unsigned char)buf[0]) == ends[i])
 			break;
 	}
 	if (i == nends) {
 		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
 		    buf);
 		usage(B_FALSE);
 	}
 	if (buf[1] == '\0' ||
 	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0')) {
 		return (10 * (int)i);
 	}
 	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
 	usage(B_FALSE);
 	/* NOTREACHED */
 }
 
 /*
  * Parse a human-friendly number such as "64", "0x100", "1.5G" or "4k"
  * into a uint64_t.
  *
  * The integer part is parsed with strtoull() (base auto-detected).  If a
  * '.' follows, the whole string is re-parsed as a double and scaled by the
  * suffix; otherwise the integer is shifted by the suffix.  Unparseable
  * input or a result that does not fit in 64 bits prints an error and calls
  * usage(), which does not return.
  */
 static uint64_t
 nicenumtoull(const char *buf)
 {
 	char *end;
 	uint64_t val;
 
 	val = strtoull(buf, &end, 0);
 	if (end == buf) {
 		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
 		usage(B_FALSE);
 	} else if (end[0] == '.') {
 		double fval = strtod(buf, &end);
 		fval *= pow(2, str2shift(end));
 		/*
 		 * Converting an out-of-range double to uint64_t is undefined,
 		 * so reject negatives and anything >= 2^64.  UINT64_MAX
 		 * itself rounds up to 2^64 as a double, hence ">=".
 		 */
 		if (fval < 0) {
 			(void) fprintf(stderr, "ztest: bad numeric value: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		if (fval >= (double)UINT64_MAX) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val = (uint64_t)fval;
 	} else {
 		int shift = str2shift(end);
 		/* shift-and-back detects bits lost off the top */
 		if (shift >= 64 || (val << shift) >> shift != val) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val <<= shift;
 	}
 	return (val);
 }
 
 /*
  * Print the option summary and exit.
  *
  * Defaults shown in the text come from ztest_opts_defaults.  When
  * 'requested' is true (the -h case) the text goes to stdout and the exit
  * status is 0; otherwise it goes to stderr and the exit status is 1.
  *
  * NOTE(review): the leading "Usage: %s" is filled with zo->zo_pool (the
  * default pool name), not the program name -- confirm this is intended.
  */
 static void
 usage(boolean_t requested)
 {
 	const ztest_shared_opts_t *zo = &ztest_opts_defaults;
 
 	char nice_vdev_size[10];
 	char nice_gang_bang[10];
 	FILE *fp = requested ? stdout : stderr;
 
 	/* Render the two byte-count defaults in human-readable form. */
 	nicenum(zo->zo_vdev_size, nice_vdev_size);
 	nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang);
 
 	(void) fprintf(fp, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
 	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-R raidz_parity (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
 	    "\t[-t threads (default: %d)]\n"
 	    "\t[-g gang_block_threshold (default: %s)]\n"
 	    "\t[-i init_count (default: %d)] initialize pool i times\n"
 	    "\t[-k kill_percentage (default: %llu%%)]\n"
 	    "\t[-p pool_name (default: %s)]\n"
 	    "\t[-f dir (default: %s)] file directory for vdev files\n"
 	    "\t[-V] verbose (use multiple times for ever more blather)\n"
 	    "\t[-E] use existing pool instead of creating new one\n"
 	    "\t[-T time (default: %llu sec)] total run time\n"
 	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
 	    "\t[-P passtime (default: %llu sec)] time per pass\n"
 	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
 	    "\t[-h] (print help)\n"
 	    "",
 	    zo->zo_pool,
 	    (u_longlong_t)zo->zo_vdevs,			/* -v */
 	    nice_vdev_size,				/* -s */
 	    zo->zo_ashift,				/* -a */
 	    zo->zo_mirrors,				/* -m */
 	    zo->zo_raidz,				/* -r */
 	    zo->zo_raidz_parity,			/* -R */
 	    zo->zo_datasets,				/* -d */
 	    zo->zo_threads,				/* -t */
 	    nice_gang_bang,				/* -g */
 	    zo->zo_init,				/* -i */
 	    (u_longlong_t)zo->zo_killrate,		/* -k */
 	    zo->zo_pool,				/* -p */
 	    zo->zo_dir,					/* -f */
 	    (u_longlong_t)zo->zo_time,			/* -T */
 	    (u_longlong_t)zo->zo_maxloops,		/* -F */
 	    (u_longlong_t)zo->zo_passtime);
 	exit(requested ? 0 : 1);
 }
 
 /*
  * Parse the command line into the global ztest_opts.
  *
  * ztest_opts starts as a copy of ztest_opts_defaults; each recognised
  * option then overrides one field, with several values clamped to a
  * minimum (or, for -R, a range).  Invalid input ends in usage(), which
  * exits.  After parsing, the raidz parity is capped at zo_raidz - 1 and
  * zo_vdevtime is derived from the run time and the vdev count.
  *
  * With -B, the alternate ztest binary and library directory are derived
  * from the supplied root and from the <isa> component of this executable's
  * own path; a missing or unusable alternate is a fatal error (without a
  * core dump).
  */
 static void
 process_options(int argc, char **argv)
 {
 	char *path;
 	ztest_shared_opts_t *zo = &ztest_opts;
 
 	int opt;
 	uint64_t value;
 	char altdir[MAXNAMELEN] = { 0 };
 
 	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
 
 	while ((opt = getopt(argc, argv,
 	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) {
 		value = 0;
 		/* Every numeric option shares one parsing step. */
 		switch (opt) {
 		case 'v':
 		case 's':
 		case 'a':
 		case 'm':
 		case 'r':
 		case 'R':
 		case 'd':
 		case 't':
 		case 'g':
 		case 'i':
 		case 'k':
 		case 'T':
 		case 'P':
 		case 'F':
 			value = nicenumtoull(optarg);
 		}
 		switch (opt) {
 		case 'v':
 			zo->zo_vdevs = value;
 			break;
 		case 's':
 			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
 			break;
 		case 'a':
 			zo->zo_ashift = value;
 			break;
 		case 'm':
 			zo->zo_mirrors = value;
 			break;
 		case 'r':
 			zo->zo_raidz = MAX(1, value);
 			break;
 		case 'R':
 			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
 			break;
 		case 'd':
 			zo->zo_datasets = MAX(1, value);
 			break;
 		case 't':
 			zo->zo_threads = MAX(1, value);
 			break;
 		case 'g':
 			zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
 			    value);
 			break;
 		case 'i':
 			zo->zo_init = value;
 			break;
 		case 'k':
 			zo->zo_killrate = value;
 			break;
 		case 'p':
 			(void) strlcpy(zo->zo_pool, optarg,
 			    sizeof (zo->zo_pool));
 			break;
 		case 'f':
 			path = realpath(optarg, NULL);
 			if (path == NULL) {
 				(void) fprintf(stderr, "error: %s: %s\n",
 				    optarg, strerror(errno));
 				usage(B_FALSE);
 			} else {
 				(void) strlcpy(zo->zo_dir, path,
 				    sizeof (zo->zo_dir));
 				/* realpath(x, NULL) returns malloc'd memory */
 				free(path);
 			}
 			break;
 		case 'V':
 			zo->zo_verbose++;
 			break;
 		case 'E':
 			zo->zo_init = 0;
 			break;
 		case 'T':
 			zo->zo_time = value;
 			break;
 		case 'P':
 			zo->zo_passtime = MAX(1, value);
 			break;
 		case 'F':
 			zo->zo_maxloops = MAX(1, value);
 			break;
 		case 'B':
 			(void) strlcpy(altdir, optarg, sizeof (altdir));
 			break;
 		case 'h':
 			usage(B_TRUE);
 			break;
 		case '?':
 		default:
 			usage(B_FALSE);
 			break;
 		}
 	}
 
 	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
 
 	zo->zo_vdevtime =
 	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
 	    UINT64_MAX >> 2);
 
 	if (strlen(altdir) > 0) {
 		char *cmd;
 		char *realaltdir;
 		char *bin;
 		char *ztest;
 		char *isa;
 		int isalen;
 
 		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 		VERIFY(NULL != realpath(getexecname(), cmd));
 		if (0 != access(altdir, F_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate ztest path: %s",
 			    altdir);
 		}
 		VERIFY(NULL != realpath(altdir, realaltdir));
 
 		/*
 		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
 		 * We want to extract <isa> to determine if we should use
 		 * 32 or 64 bit binaries.
 		 *
 		 * If 'cmd' does not have that shape, strstr() returns NULL;
 		 * fail cleanly instead of dereferencing it.
 		 */
 		bin = strstr(cmd, "/usr/bin/");
 		ztest = (bin != NULL) ? strstr(bin, "/ztest") : NULL;
 		if (ztest == NULL) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_FALSE, "cannot determine isa from ztest "
 			    "path: %s", cmd);
 		}
 		isa = bin + 9;		/* skip past "/usr/bin/" */
 		/*
 		 * NOTE(review): when the binary sits directly in /usr/bin
 		 * this is -1, and a negative "%.*s" precision is treated as
 		 * if no precision were given -- confirm that is acceptable.
 		 */
 		isalen = ztest - isa;
 		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
 		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
 		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
 		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);
 
 		if (0 != access(zo->zo_alt_ztest, X_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate ztest: %s",
 			    zo->zo_alt_ztest);
 		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate lib directory %s",
 			    zo->zo_alt_libpath);
 		}
 
 		umem_free(cmd, MAXPATHLEN);
 		umem_free(realaltdir, MAXPATHLEN);
 	}
 }
 
 /*
  * Record the pool's normal-class usage in 'zs', sync the pool config, dump
  * the debug messages, and then SIGKILL this process.
  */
 static void
 ztest_kill(ztest_shared_t *zs)
 {
 	spa_t *spa = ztest_spa;
 
 	/* Snapshot allocated and total space before dying. */
 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	/*
 	 * The config must be up to date before ztest goes away; see the
 	 * comment above spa_config_sync().
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa_config_sync(spa, B_FALSE, B_FALSE);
 	mutex_exit(&spa_namespace_lock);
 
 	zfs_dbgmsg_print(FTAG);
 
 	(void) kill(getpid(), SIGKILL);
 }
 
 /*
  * Return a random value in [0, range), read from the ztest_fd_rand
  * descriptor.  A range of 0 returns 0 without reading anything.  A short
  * read is fatal.
  */
 static uint64_t
 ztest_random(uint64_t range)
 {
 	uint64_t raw;
 
 	ASSERT3S(ztest_fd_rand, >=, 0);
 
 	if (range != 0) {
 		if (read(ztest_fd_rand, &raw, sizeof (raw)) != sizeof (raw))
 			fatal(1, "short read from /dev/urandom");
 
 		return (raw % range);
 	}
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 ztest_record_enospc(const char *s)
 {
 	ztest_shared->zs_enospc_count++;
 }
 
 /*
  * Return the ashift to use: the configured zo_ashift when it is non-zero,
  * otherwise a random value in [SPA_MINBLOCKSHIFT, SPA_MINBLOCKSHIFT + 4].
  */
 static uint64_t
 ztest_get_ashift(void)
 {
 	uint64_t configured = ztest_opts.zo_ashift;
 
 	if (configured != 0)
 		return (configured);
 
 	return (SPA_MINBLOCKSHIFT + ztest_random(5));
 }
 
 /*
  * Build the nvlist describing one file vdev.
  *
  * If 'path' is NULL a name is generated in ztest_opts.zo_dir: an aux name
  * (using zs_vdev_aux) when 'aux' is given, otherwise the next leaf name
  * (consuming zs_vdev_next_leaf).  'pool' overrides the pool name used in
  * the generated path.  When 'size' is non-zero the backing file is created
  * or truncated and sized to 'size' bytes.  An 'ashift' of 0 means "pick one
  * with ztest_get_ashift()".  The caller owns the returned nvlist.
  */
 static nvlist_t *
 make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
 {
 	char pathbuf[MAXPATHLEN];
 	nvlist_t *file;
 
 	if (ashift == 0)
 		ashift = ztest_get_ashift();
 
 	if (path == NULL) {
 		char *poolname = (pool != NULL) ? pool : ztest_opts.zo_pool;
 		uint64_t vdev;
 
 		path = pathbuf;
 
 		if (aux == NULL) {
 			/* Leaf vdev: take and advance the shared counter. */
 			vdev = ztest_shared->zs_vdev_next_leaf++;
 			(void) snprintf(path, sizeof (pathbuf),
 			    ztest_dev_template, ztest_opts.zo_dir,
 			    poolname, vdev);
 		} else {
 			vdev = ztest_shared->zs_vdev_aux;
 			(void) snprintf(path, sizeof (pathbuf),
 			    ztest_aux_template, ztest_opts.zo_dir,
 			    poolname, aux, vdev);
 		}
 	}
 
 	if (size != 0) {
 		int fd;
 
 		fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", path);
 		if (ftruncate(fd, size) != 0)
 			fatal(1, "can't ftruncate %s", path);
 		(void) close(fd);
 	}
 
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	return (file);
 }
 
 /*
  * Build a raidz vdev nvlist with 'r' file children, using the configured
  * zo_raidz_parity.  With fewer than two children this is just a single
  * file vdev.  The remaining arguments are passed to make_vdev_file().
  */
 static nvlist_t *
 make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
     uint64_t ashift, int r)
 {
 	nvlist_t *raidz, **kids;
 	size_t kids_size;
 	int i;
 
 	if (r < 2)
 		return (make_vdev_file(path, aux, pool, size, ashift));
 
 	kids_size = r * sizeof (nvlist_t *);
 	kids = umem_alloc(kids_size, UMEM_NOFAIL);
 
 	for (i = 0; i < r; i++)
 		kids[i] = make_vdev_file(path, aux, pool, size, ashift);
 
 	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_RAIDZ) == 0);
 	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
 	    ztest_opts.zo_raidz_parity) == 0);
 	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
 	    kids, r) == 0);
 
 	/* The child nvlists are released once they have been added. */
 	for (i = 0; i < r; i++)
 		nvlist_free(kids[i]);
 
 	umem_free(kids, kids_size);
 
 	return (raidz);
 }
 
 /*
  * Build a mirror vdev nvlist with 'm' children, each produced by
  * make_vdev_raidz().  With m < 1 no mirror is created and the single
  * raidz/file vdev is returned directly.
  */
 static nvlist_t *
 make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
     uint64_t ashift, int r, int m)
 {
 	nvlist_t *mirror, **kids;
 	size_t kids_size;
 	int i;
 
 	if (m < 1)
 		return (make_vdev_raidz(path, aux, pool, size, ashift, r));
 
 	kids_size = m * sizeof (nvlist_t *);
 	kids = umem_alloc(kids_size, UMEM_NOFAIL);
 
 	for (i = 0; i < m; i++)
 		kids[i] = make_vdev_raidz(path, aux, pool, size, ashift, r);
 
 	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_MIRROR) == 0);
 	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
 	    kids, m) == 0);
 
 	/* The child nvlists are released once they have been added. */
 	for (i = 0; i < m; i++)
 		nvlist_free(kids[i]);
 
 	umem_free(kids, kids_size);
 
 	return (mirror);
 }
 
 /*
  * Build a root vdev nvlist with 't' top-level children, each produced by
  * make_vdev_mirror() and tagged with ZPOOL_CONFIG_IS_LOG = 'log'.  The
  * children are stored under the name 'aux' when it is non-NULL, otherwise
  * under ZPOOL_CONFIG_CHILDREN.  't' must be positive.
  */
 static nvlist_t *
 make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
     int log, int r, int m, int t)
 {
 	nvlist_t *root, **tops;
 	size_t tops_size;
 	int i;
 
 	ASSERT(t > 0);
 
 	tops_size = t * sizeof (nvlist_t *);
 	tops = umem_alloc(tops_size, UMEM_NOFAIL);
 
 	for (i = 0; i < t; i++) {
 		tops[i] = make_vdev_mirror(path, aux, pool, size, ashift,
 		    r, m);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_IS_LOG,
 		    log) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
 	    tops, t) == 0);
 
 	/* The child nvlists are released once they have been added. */
 	for (i = 0; i < t; i++)
 		nvlist_free(tops[i]);
 
 	umem_free(tops, tops_size);
 
 	return (root);
 }
 
 /*
  * Pick a random spa version no older than initial_version.  A pre-feature
  * initial version yields a random version in
  * [initial_version, SPA_VERSION_BEFORE_FEATURES]; anything newer yields
  * SPA_VERSION_FEATURES.
  */
 static uint64_t
 ztest_random_spa_version(uint64_t initial_version)
 {
 	uint64_t version;
 
 	if (initial_version > SPA_VERSION_BEFORE_FEATURES) {
 		version = SPA_VERSION_FEATURES;
 	} else {
 		uint64_t span =
 		    SPA_VERSION_BEFORE_FEATURES - initial_version + 1;
 
 		/* ztest_random(span) < span, so this cannot pass the cap. */
 		version = initial_version + ztest_random(span);
 	}
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	return (version);
 }
 
 /*
  * Return a random power-of-two block size.
  *
  * The upper shift is SPA_OLD_MAXBLOCKSHIFT, or 20 (1MB blocks) when the
  * SPA supports the new SPA_MAXBLOCKSIZE.  The random shift added to
  * SPA_MINBLOCKSHIFT lies in [0, upper - spa_max_ashift].
  */
 static int
 ztest_random_blocksize(void)
 {
 	int upper_shift = SPA_OLD_MAXBLOCKSHIFT;
 	uint64_t extra_shift;
 
 	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
 		upper_shift = 20;
 
 	extra_shift =
 	    ztest_random(upper_shift - ztest_spa->spa_max_ashift + 1);
 
 	return (1 << (SPA_MINBLOCKSHIFT + extra_shift));
 }
 
 /*
  * Return a random indirect block shift in
  * [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
  */
 static int
 ztest_random_ibshift(void)
 {
 	uint64_t choices = DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1;
 
 	return (DN_MIN_INDBLKSHIFT + ztest_random(choices));
 }
 
 /*
  * Return the index of a randomly chosen top-level vdev of 'spa'.
  *
  * Holes, vdevs without a metaslab group or class, and (unless log_ok) log
  * vdevs are skipped; the draw repeats until an acceptable one is found.
  * The caller must hold the spa config lock.
  */
 static uint64_t
 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *tvd;
 	uint64_t top;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	for (;;) {
 		top = ztest_random(rvd->vdev_children);
 		tvd = rvd->vdev_child[top];
 
 		if (tvd->vdev_ishole)
 			continue;
 		if (tvd->vdev_islog && !log_ok)
 			continue;
 		if (tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL)
 			continue;
 		break;
 	}
 
 	return (top);
 }
 
 /*
  * Return a random value for the dataset property 'prop'.  For the checksum
  * property the draw is repeated until the value is not ZIO_CHECKSUM_OFF.
  */
 static uint64_t
 ztest_random_dsl_prop(zfs_prop_t prop)
 {
 	uint64_t value;
 
 	for (;;) {
 		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
 		if (prop != ZFS_PROP_CHECKSUM || value != ZIO_CHECKSUM_OFF)
 			return (value);
 	}
 }
 
 /*
  * Set the integer dataset property 'prop' on 'osname', either locally or
  * (when 'inherit' is set) with ZPROP_SRC_NONE.
  *
  * ENOSPC is recorded and returned; any other failure trips an assertion.
  * On success the property is read back and, at verbosity 6 or higher,
  * printed together with its setpoint.
  */
 static int
 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
     boolean_t inherit)
 {
 	const char *propname = zfs_prop_to_name(prop);
 	char setpoint[MAXPATHLEN];
 	uint64_t readback;
 	int error;
 
 	error = dsl_prop_set_int(osname, propname,
 	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
 
 	if (error != ENOSPC) {
 		ASSERT0(error);
 
 		VERIFY0(dsl_prop_get_integer(osname, propname, &readback,
 		    setpoint));
 
 		if (ztest_opts.zo_verbose >= 6) {
 			const char *valstr;
 
 			VERIFY(zfs_prop_index_to_string(prop, readback,
 			    &valstr) == 0);
 			(void) printf("%s %s = %s at '%s'\n",
 			    osname, propname, valstr, setpoint);
 		}
 
 		return (error);
 	}
 
 	ztest_record_enospc(FTAG);
 	return (error);
 }
 
 /*
  * Set the integer pool property 'prop' to 'value' on ztest_spa.
  * ENOSPC is recorded and returned; any other failure trips an assertion.
  */
 static int
 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
 {
 	nvlist_t *props = NULL;
 	int error;
 
 	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
 
 	error = spa_prop_set(ztest_spa, props);
 
 	nvlist_free(props);
 
 	if (error != ENOSPC) {
 		ASSERT0(error);
 	} else {
 		ztest_record_enospc(FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Initialise a reader/writer lock: its mutex and condition variable are
  * created, and it starts with no writer and no readers.
  */
 static void
 ztest_rll_init(rll_t *rll)
 {
 	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
 	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
 
 	rll->rll_readers = 0;
 	rll->rll_writer = NULL;
 }
 
 /*
  * Tear down a reader/writer lock.  The lock must be idle (no writer and
  * no readers).
  */
 static void
 ztest_rll_destroy(rll_t *rll)
 {
 	ASSERT(rll->rll_writer == NULL);
 	ASSERT(rll->rll_readers == 0);
 
 	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
 	VERIFY(cond_destroy(&rll->rll_cv) == 0);
 }
 
 /*
  * Acquire 'rll' as reader (type == RL_READER) or as writer (any other
  * type).  A reader waits while a writer holds the lock; a writer waits
  * until there is neither a writer nor any reader.
  */
 static void
 ztest_rll_lock(rll_t *rll, rl_type_t type)
 {
 	VERIFY(mutex_lock(&rll->rll_lock) == 0);
 
 	if (type != RL_READER) {
 		/* Writer: needs the lock completely idle. */
 		while (rll->rll_writer != NULL || rll->rll_readers)
 			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
 		rll->rll_writer = curthread;
 	} else {
 		/* Reader: only a writer blocks us. */
 		while (rll->rll_writer != NULL)
 			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
 		rll->rll_readers++;
 	}
 
 	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
 }
 
 /*
  * Release 'rll'.  If a writer holds it the writer is cleared, otherwise
  * one reader is dropped.  All waiters are woken once the lock is idle.
  */
 static void
 ztest_rll_unlock(rll_t *rll)
 {
 	VERIFY(mutex_lock(&rll->rll_lock) == 0);
 
 	if (!rll->rll_writer) {
 		ASSERT(rll->rll_readers != 0);
 		ASSERT(rll->rll_writer == NULL);
 		rll->rll_readers--;
 	} else {
 		ASSERT(rll->rll_readers == 0);
 		rll->rll_writer = NULL;
 	}
 
 	if (rll->rll_writer == NULL && rll->rll_readers == 0)
 		VERIFY(cond_broadcast(&rll->rll_cv) == 0);
 
 	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
 }
 
 /*
  * Lock the per-object lock for 'object'.  The lock is selected from
  * zd_object_lock[] by masking the object number with
  * ZTEST_OBJECT_LOCKS - 1.
  */
 static void
 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
 {
 	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
 
 	ztest_rll_lock(&zd->zd_object_lock[slot], type);
 }
 
 /*
  * Unlock the per-object lock for 'object', selected the same way as in
  * ztest_object_lock().
  */
 static void
 ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
 {
 	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
 
 	ztest_rll_unlock(&zd->zd_object_lock[slot]);
 }
 
 /*
  * Lock the range [offset, offset + size) of 'object'.
  *
  * The range is hashed onto one of the zd_range_lock[] entries, which is
  * then acquired in the requested mode.  Returns a newly allocated rl_t
  * describing the lock; release it with ztest_range_unlock().
  */
 static rl_t *
 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
     uint64_t size, rl_type_t type)
 {
 	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
 	rl_t *rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
 
 	rl->rl_lock = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
 	rl->rl_object = object;
 	rl->rl_offset = offset;
 	rl->rl_size = size;
 
 	ztest_rll_lock(rl->rl_lock, type);
 
 	return (rl);
 }
 
 /*
  * Release a range lock obtained from ztest_range_lock() and free its
  * rl_t descriptor.
  */
 static void
 ztest_range_unlock(rl_t *rl)
 {
 	ztest_rll_unlock(rl->rl_lock);
 
 	umem_free(rl, sizeof (*rl));
 }
 
 /*
  * Initialise the per-dataset state 'zd' for objset 'os': record the
  * objset, its ZIL and name, attach the shared state 'szd' (resetting its
  * sequence number when present), and create all of the dataset's locks.
  */
 static void
 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
 {
 	rll_t *rll;
 
 	zd->zd_os = os;
 	zd->zd_zilog = dmu_objset_zil(os);
 	zd->zd_shared = szd;
 	dmu_objset_name(os, zd->zd_name);
 
 	if (szd != NULL)
 		szd->zd_seq = 0;
 
 	VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
 	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
 
 	for (rll = &zd->zd_object_lock[0];
 	    rll < &zd->zd_object_lock[ZTEST_OBJECT_LOCKS]; rll++)
 		ztest_rll_init(rll);
 
 	for (rll = &zd->zd_range_lock[0];
 	    rll < &zd->zd_range_lock[ZTEST_RANGE_LOCKS]; rll++)
 		ztest_rll_init(rll);
 }
 
 /*
  * Tear down the locks created by ztest_zd_init().
  *
  * This includes zd_zilog_lock, which ztest_zd_init() initialises with
  * rwlock_init() and which would otherwise never be destroyed.  Its result
  * is deliberately ignored so that teardown behaves as before.
  */
 static void
 ztest_zd_fini(ztest_ds_t *zd)
 {
 	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
 	(void) rwlock_destroy(&zd->zd_zilog_lock);
 
 	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
 		ztest_rll_destroy(&zd->zd_object_lock[l]);
 
 	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
 		ztest_rll_destroy(&zd->zd_range_lock[l]);
 }
 
 #define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
 
 /*
  * Try to assign 'tx' to a transaction group.
  *
  * Returns the (non-zero) txg on success.  On failure the tx is aborted and
  * 0 is returned: ERESTART (expected only with TXG_NOWAIT) first waits on
  * the tx, and anything else is expected to be ENOSPC and is recorded
  * against 'tag'.
  */
 static uint64_t
 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
 {
 	int error = dmu_tx_assign(tx, txg_how);
 
 	if (error == 0) {
 		uint64_t txg = dmu_tx_get_txg(tx);
 
 		ASSERT(txg != 0);
 		return (txg);
 	}
 
 	if (error != ERESTART) {
 		ASSERT3U(error, ==, ENOSPC);
 		ztest_record_enospc(tag);
 	} else {
 		ASSERT(txg_how == TXG_NOWAIT);
 		dmu_tx_wait(tx);
 	}
 
 	dmu_tx_abort(tx);
 	return (0);
 }
 
 /*
  * Fill 'buf' with copies of the 64-bit 'value'.  'size' is a byte count;
  * words are written while the write position is below buf + size.
  */
 static void
 ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
 {
 	uint64_t *limit = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
 
 	for (uint64_t *cur = buf; cur < limit; cur++)
 		*cur = value;
 }
 
 /*
  * Return true when every 64-bit word in the first 'size' bytes of 'buf'
  * equals 'value'.  The whole buffer is always scanned.
  */
 static boolean_t
 ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
 {
 	uint64_t *limit = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
 	uint64_t mismatch = 0;
 
 	/* XOR is non-zero exactly when a word differs from 'value'. */
 	for (uint64_t *cur = buf; cur < limit; cur++)
 		mismatch |= (*cur ^ value);
 
 	return (mismatch == 0);
 }
 
 /*
  * Fill in the block tag 'bt' with BT_MAGIC, the objset id of 'os', and
  * the supplied object, offset, generation, txg and creation txg.
  */
 static void
 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
 {
 	uint64_t objset_id = dmu_objset_id(os);
 
 	bt->bt_magic = BT_MAGIC;
 	bt->bt_objset = objset_id;
 	bt->bt_object = object;
 	bt->bt_offset = offset;
 	bt->bt_gen = gen;
 	bt->bt_txg = txg;
 	bt->bt_crtxg = crtxg;
 }
 
 /*
  * Assert that the block tag 'bt' is consistent with the expected values.
  *
  * The magic, objset id, object, offset and creation txg must match
  * exactly; the tag's generation and txg only need to be no newer than
  * 'gen' and 'txg'.
  */
 static void
 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
 {
 	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
 	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
 	ASSERT3U(bt->bt_object, ==, object);
 	ASSERT3U(bt->bt_offset, ==, offset);
 	/* gen and txg are upper bounds, not exact matches */
 	ASSERT3U(bt->bt_gen, <=, gen);
 	ASSERT3U(bt->bt_txg, <=, txg);
 	ASSERT3U(bt->bt_crtxg, ==, crtxg);
 }
 
 /*
  * Return a pointer to the block tag stored in the last
  * sizeof (ztest_block_tag_t) bytes of the bonus area of 'db'.  The bonus
  * size is taken from the object info of 'db'.
  */
 static ztest_block_tag_t *
 ztest_bt_bonus(dmu_buf_t *db)
 {
 	dmu_object_info_t doi;
 	char *bonus_end;
 
 	dmu_object_info_from_db(db, &doi);
 	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (ztest_block_tag_t));
 
 	bonus_end = (char *)db->db_data + doi.doi_bonus_size;
 
 	return ((void *)(bonus_end - sizeof (ztest_block_tag_t)));
 }
 
 /*
  * ZIL logging ops
  */
 
 /*
  * ztest-specific names for fields of a create log record: ztest stores
  * the object type, block size, indirect block shift, bonus type and bonus
  * length in these otherwise differently named lr_create_t fields (see
  * ztest_replay_create()).
  */
 #define	lrz_type	lr_mode
 #define	lrz_blocksize	lr_uid
 #define	lrz_ibshift	lr_gid
 #define	lrz_bonustype	lr_rdev
 #define	lrz_bonuslen	lr_crtime[1]
 
 /*
  * Log a create record to the dataset's ZIL, unless the ZIL is currently
  * being replayed.  The NUL-terminated object name is expected to follow
  * 'lr' in memory and is copied into the itx along with the record body.
  */
 static void
 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
 {
 	itx_t *itx;
 	size_t lrsize;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	/* record plus the trailing name, including its terminator */
 	lrsize = sizeof (*lr) + strlen((char *)(void *)(lr + 1)) + 1;
 
 	itx = zil_itx_create(TX_CREATE, lrsize);
 	/* copy everything after the common lr_t header */
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, lrsize - sizeof (lr_t));
 
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
  * Log a remove record to the dataset's ZIL, unless the ZIL is currently
  * being replayed.  The NUL-terminated object name is expected to follow
  * 'lr' in memory; 'object' is recorded as the itx's object id.
  */
 static void
 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
 {
 	itx_t *itx;
 	size_t lrsize;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	/* record plus the trailing name, including its terminator */
 	lrsize = sizeof (*lr) + strlen((char *)(void *)(lr + 1)) + 1;
 
 	itx = zil_itx_create(TX_REMOVE, lrsize);
 	/* copy everything after the common lr_t header */
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, lrsize - sizeof (lr_t));
 	itx->itx_oid = object;
 
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
  * Log a write record to the dataset's ZIL, unless the ZIL is currently
  * being replayed.
  *
  * The write state is chosen at random, but writes longer than
  * ZIL_MAX_LOG_DATA are always logged as WR_INDIRECT.  For WR_COPIED the
  * data is read back from the object into the itx; if that read fails the
  * itx is recreated without data and the state falls back to WR_NEED_COPY.
  */
 static void
 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 {
 	itx_t *itx;
 	/* note: the random draw happens even when we return early below */
 	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	if (lr->lr_length > ZIL_MAX_LOG_DATA)
 		write_state = WR_INDIRECT;
 
 	/* only WR_COPIED carries the data inline after the record */
 	itx = zil_itx_create(TX_WRITE,
 	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
 
 	if (write_state == WR_COPIED &&
 	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
 	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
 		zil_itx_destroy(itx);
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 		write_state = WR_NEED_COPY;
 	}
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	/* roughly one write in eight is marked synchronous */
 	itx->itx_sync = (ztest_random(8) == 0);
 	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
 
 	/* copy everything after the common lr_t header */
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
  * Log a truncate record to the dataset's ZIL as a non-synchronous itx,
  * unless the ZIL is currently being replayed.
  */
 static void
 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
 {
 	if (!zil_replaying(zd->zd_zilog, tx)) {
 		itx_t *itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
 
 		/* copy everything after the common lr_t header */
 		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 		    sizeof (*lr) - sizeof (lr_t));
 		itx->itx_sync = B_FALSE;
 
 		zil_itx_assign(zd->zd_zilog, itx, tx);
 	}
 }
 
 /*
  * Log a setattr record to the dataset's ZIL as a non-synchronous itx,
  * unless the ZIL is currently being replayed.
  */
 static void
 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
 {
 	if (!zil_replaying(zd->zd_zilog, tx)) {
 		itx_t *itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
 
 		/* copy everything after the common lr_t header */
 		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 		    sizeof (*lr) - sizeof (lr_t));
 		itx->itx_sync = B_FALSE;
 
 		zil_itx_assign(zd->zd_zilog, itx, tx);
 	}
 }
 
 /*
  * ZIL replay ops
  */
 
 /*
  * Create the object described by 'lr' and enter its name in the directory
  * object ZTEST_DIROBJ.
  *
  * When lr_foid is 0 a new object is allocated and its id is stored back
  * into lr_foid; otherwise that specific object id is claimed.  The object
  * is a ZAP when lrz_type is DMU_OT_ZAP_OTHER and a plain DMU object
  * otherwise.  The new object's bonus buffer is stamped with a block tag,
  * and the create is logged before the tx is committed.
  *
  * Returns 0 on success, ENOSPC if no tx could be assigned, or the claim
  * error (asserted to be EEXIST, and only during replay).
  */
 static int
 ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	objset_t *os = zd->zd_os;
 	ztest_block_tag_t *bbt;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	int error = 0;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
 	ASSERT(name[0] != '\0');
 
 	tx = dmu_tx_create(os);
 
 	/* hold the directory entry, then the object about to be created */
 	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
 
 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	} else {
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 	}
 
 	/* on failure ztest_tx_assign() has already aborted the tx */
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0)
 		return (ENOSPC);
 
 	/* an object id is supplied exactly when we are replaying */
 	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
 
 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
 		if (lr->lr_foid == 0) {
 			lr->lr_foid = zap_create(os,
 			    lr->lrz_type, lr->lrz_bonustype,
 			    lr->lrz_bonuslen, tx);
 		} else {
 			error = zap_create_claim(os, lr->lr_foid,
 			    lr->lrz_type, lr->lrz_bonustype,
 			    lr->lrz_bonuslen, tx);
 		}
 	} else {
 		if (lr->lr_foid == 0) {
 			lr->lr_foid = dmu_object_alloc(os,
 			    lr->lrz_type, 0, lr->lrz_bonustype,
 			    lr->lrz_bonuslen, tx);
 		} else {
 			error = dmu_object_claim(os, lr->lr_foid,
 			    lr->lrz_type, 0, lr->lrz_bonustype,
 			    lr->lrz_bonuslen, tx);
 		}
 	}
 
 	if (error) {
 		ASSERT3U(error, ==, EEXIST);
 		ASSERT(zd->zd_zilog->zl_replay);
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	ASSERT(lr->lr_foid != 0);
 
 	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
 		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
 		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
 
 	/* stamp the bonus buffer; creation txg == current txg */
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
 	bbt = ztest_bt_bonus(db);
 	dmu_buf_will_dirty(db, tx);
 	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
 	dmu_buf_rele(db, FTAG);
 
 	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
 	    &lr->lr_foid, tx));
 
 	(void) ztest_log_create(zd, tx, lr);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Remove the object named by 'lr' from the directory object ZTEST_DIROBJ
  * and free it.
  *
  * The name is looked up to find the object, which is write-locked for the
  * duration.  A ZAP object is destroyed with zap_destroy(), anything else
  * with dmu_object_free().  The remove is logged before the tx is
  * committed.
  *
  * Returns 0 on success or ENOSPC if no tx could be assigned.
  */
 static int
 ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	objset_t *os = zd->zd_os;
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	uint64_t object, txg;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
 	ASSERT(name[0] != '\0');
 
 	VERIFY3U(0, ==,
 	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
 	ASSERT(object != 0);
 
 	ztest_object_lock(zd, object, RL_WRITER);
 
 	/* the object type decides how it is freed below */
 	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 
 	/* on failure ztest_tx_assign() has already aborted the tx */
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		ztest_object_unlock(zd, object);
 		return (ENOSPC);
 	}
 
 	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
 		VERIFY3U(0, ==, zap_destroy(os, object, tx));
 	} else {
 		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
 	}
 
 	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
 
 	(void) ztest_log_remove(zd, tx, lr, object);
 
 	dmu_tx_commit(tx);
 
 	ztest_object_unlock(zd, object);
 
 	return (0);
 }
 
 /*
  * Write the data that follows 'lr' to object lr_foid.
  *
  * The object is read-locked and the target range write-locked for the
  * duration.  If the data starts with a ztest block tag, the existing
  * on-disk tag is (usually) verified first and the new tag is regenerated
  * from the bonus buffer's generation and creation txg.  The data is
  * written with dmu_write(), or occasionally through a loaned arc buffer
  * when the write covers exactly one aligned data block.  The write is
  * logged before the tx is committed.
  *
  * Returns 0 on success or ENOSPC if no tx could be assigned.
  */
 static int
 ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
 {
 	objset_t *os = zd->zd_os;
 	void *data = lr + 1;			/* data follows lr */
 	uint64_t offset, length;
 	ztest_block_tag_t *bt = data;
 	ztest_block_tag_t *bbt;
 	uint64_t gen, txg, lrtxg, crtxg;
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	dmu_buf_t *db;
 	arc_buf_t *abuf = NULL;
 	rl_t *rl;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 		if (length < blocksize) {
 			/* round the offset down to the block boundary */
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
 	}
 
 	/* the tag may be in the opposite byte order */
 	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
 		byteswap_uint64_array(bt, sizeof (*bt));
 
 	/* no valid tag: treat the payload as untagged data */
 	if (bt->bt_magic != BT_MAGIC)
 		bt = NULL;
 
 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
 	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
 
 	dmu_object_info_from_db(db, &doi);
 
 	bbt = ztest_bt_bonus(db);
 	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
 	gen = bbt->bt_gen;
 	crtxg = bbt->bt_crtxg;
 	lrtxg = lr->lr_common.lrc_txg;
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
 
 	/* occasionally use a loaned arc buffer for whole, aligned blocks */
 	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
 	    P2PHASE(offset, length) == 0)
 		abuf = dmu_request_arcbuf(db, length);
 
 	/* on failure ztest_tx_assign() has already aborted the tx */
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		if (abuf != NULL)
 			dmu_return_arcbuf(abuf);
 		dmu_buf_rele(db, FTAG);
 		ztest_range_unlock(rl);
 		ztest_object_unlock(zd, lr->lr_foid);
 		return (ENOSPC);
 	}
 
 	if (bt != NULL) {
 		/*
 		 * Usually, verify the old data before writing new data --
 		 * but not always, because we also want to verify correct
 		 * behavior when the data was not recently read into cache.
 		 */
 		ASSERT(offset % doi.doi_data_block_size == 0);
 		if (ztest_random(4) != 0) {
 			int prefetch = ztest_random(2) ?
 			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
 			ztest_block_tag_t rbt;
 
 			VERIFY(dmu_read(os, lr->lr_foid, offset,
 			    sizeof (rbt), &rbt, prefetch) == 0);
 			if (rbt.bt_magic == BT_MAGIC) {
 				ztest_bt_verify(&rbt, os, lr->lr_foid,
 				    offset, gen, txg, crtxg);
 			}
 		}
 
 		/*
 		 * Writes can appear to be newer than the bonus buffer because
 		 * the ztest_get_data() callback does a dmu_read() of the
 		 * open-context data, which may be different than the data
 		 * as it was when the write was generated.
 		 */
 		if (zd->zd_zilog->zl_replay) {
 			ztest_bt_verify(bt, os, lr->lr_foid, offset,
 			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
 			    bt->bt_crtxg);
 		}
 
 		/*
 		 * Set the bt's gen/txg to the bonus buffer's gen/txg
 		 * so that all of the usual ASSERTs will work.
 		 */
 		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
 	}
 
 	if (abuf == NULL) {
 		dmu_write(os, lr->lr_foid, offset, length, data, tx);
 	} else {
 		bcopy(data, abuf->b_data, length);
 		dmu_assign_arcbuf(db, offset, abuf, tx);
 	}
 
 	(void) ztest_log_write(zd, tx, lr);
 
 	dmu_buf_rele(db, FTAG);
 
 	dmu_tx_commit(tx);
 
 	ztest_range_unlock(rl);
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (0);
 }
 
 /*
  * Free the range [lr_offset, lr_offset + lr_length) of object lr_foid.
  *
  * The object is read-locked and the range write-locked for the duration.
  * The truncate is logged before the tx is committed.
  *
  * Returns 0 on success or ENOSPC if no tx could be assigned.
  */
 static int
 ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
 {
 	objset_t *os = zd->zd_os;
 	dmu_tx_t *tx;
 	rl_t *rl;
 	int error = 0;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
 	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
 	    RL_WRITER);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
 
 	/* on failure ztest_tx_assign() has already aborted the tx */
 	if (ztest_tx_assign(tx, TXG_WAIT, FTAG) != 0) {
 		VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
 		    lr->lr_length, tx) == 0);
 
 		(void) ztest_log_truncate(zd, tx, lr);
 
 		dmu_tx_commit(tx);
 	} else {
 		error = ENOSPC;
 	}
 
 	ztest_range_unlock(rl);
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (error);
 }
 
 /*
  * Apply a setattr log record to the bonus buffer of object lr_foid.
  *
  * While the ZIL is replaying (zl_replay) the size and mode carried by
  * the record are used as-is.  Otherwise a random new bonus size (a
  * multiple of the block tag size) is chosen and lr_mode is set to the
  * current generation plus one.  The bonus buffer is then resized to
  * lr_size and its block tag regenerated with lr_mode as the generation.
  *
  * Returns 0 on success, or ENOSPC when ztest_tx_assign() returns a txg
  * of 0.
  */
 static int
 ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
 {
 	objset_t *os = zd->zd_os;
 	dmu_buf_t *bonus;
 	dmu_tx_t *tx;
 	ztest_block_tag_t *tag;
 	uint64_t txg, lrtxg, crtxg;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &bonus));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, lr->lr_foid);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		dmu_buf_rele(bonus, FTAG);
 		ztest_object_unlock(zd, lr->lr_foid);
 		return (ENOSPC);
 	}
 
 	tag = ztest_bt_bonus(bonus);
 	ASSERT3U(tag->bt_magic, ==, BT_MAGIC);
 	lrtxg = lr->lr_common.lrc_txg;
 	crtxg = tag->bt_crtxg;
 
 	if (!zd->zd_zilog->zl_replay) {
 		/*
 		 * Fresh operation: pick a random size (at least one block
 		 * tag, at most the current bonus size) and bump the
 		 * generation.
 		 */
 		uint64_t ntags =
 		    ztest_random(bonus->db_size / sizeof (*tag)) + 1;
 
 		lr->lr_size = ntags * sizeof (*tag);
 		lr->lr_mode = tag->bt_gen + 1;
 		ASSERT(lrtxg == 0);
 	} else {
 		/* Replay: the record must already be fully populated. */
 		ASSERT(lr->lr_size != 0);
 		ASSERT(lr->lr_mode != 0);
 		ASSERT(lrtxg != 0);
 	}
 
 	/*
 	 * The bonus buffer as it stands must not be newer than our txg.
 	 */
 	ztest_bt_verify(tag, os, lr->lr_foid, -1ULL, lr->lr_mode,
 	    MAX(txg, lrtxg), crtxg);
 
 	dmu_buf_will_dirty(bonus, tx);
 
 	ASSERT3U(lr->lr_size, >=, sizeof (*tag));
 	ASSERT3U(lr->lr_size, <=, bonus->db_size);
 	VERIFY0(dmu_set_bonus(bonus, lr->lr_size, tx));
 
 	/* Re-fetch the tag pointer after the bonus buffer was resized. */
 	tag = ztest_bt_bonus(bonus);
 
 	ztest_bt_generate(tag, os, lr->lr_foid, -1ULL, lr->lr_mode, txg,
 	    crtxg);
 
 	dmu_buf_rele(bonus, FTAG);
 
 	(void) ztest_log_setattr(zd, tx, lr);
 
 	dmu_tx_commit(tx);
 
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (0);
 }
 
 /*
  * Table of ztest replay handlers with TX_MAX_TYPE slots.  The per-entry
  * comments name the transaction type each slot is meant for; a NULL
  * entry means ztest supplies no handler for that type.  The table is
  * handed to zil_replay() by ztest_zil_remount().
  * NOTE(review): the slot labels are taken from the comments below, not
  * checked against the TX_* definitions -- confirm if entries are added.
  */
 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
 	NULL,			/* 0 no such transaction type */
 	ztest_replay_create,	/* TX_CREATE */
 	NULL,			/* TX_MKDIR */
 	NULL,			/* TX_MKXATTR */
 	NULL,			/* TX_SYMLINK */
 	ztest_replay_remove,	/* TX_REMOVE */
 	NULL,			/* TX_RMDIR */
 	NULL,			/* TX_LINK */
 	NULL,			/* TX_RENAME */
 	ztest_replay_write,	/* TX_WRITE */
 	ztest_replay_truncate,	/* TX_TRUNCATE */
 	ztest_replay_setattr,	/* TX_SETATTR */
 	NULL,			/* TX_ACL */
 	NULL,			/* TX_CREATE_ACL */
 	NULL,			/* TX_CREATE_ATTR */
 	NULL,			/* TX_CREATE_ACL_ATTR */
 	NULL,			/* TX_MKDIR_ACL */
 	NULL,			/* TX_MKDIR_ATTR */
 	NULL,			/* TX_MKDIR_ACL_ATTR */
 	NULL,			/* TX_WRITE2 */
 };
 
 /*
  * ZIL get_data callbacks
  */
 
 /*
  * Tear down the state built by ztest_get_data(): release the data
  * buffer hold (if one was taken), drop the range and object locks, add
  * the block pointer to the ZIL when the operation succeeded and a block
  * pointer was recorded, and finally free the zgd itself.
  */
 static void
 ztest_get_done(zgd_t *zgd, int error)
 {
 	ztest_ds_t *zd = zgd->zgd_private;
 	/* Capture the object id before the range lock is released. */
 	uint64_t foid = zgd->zgd_rl->rl_object;
 
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	ztest_range_unlock(zgd->zgd_rl);
 	ztest_object_unlock(zd, foid);
 
 	if (error == 0 && zgd->zgd_bp != NULL)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 
 	umem_free(zgd, sizeof (*zgd));
 }
 
 /*
  * ZIL get_data callback (registered through zil_open()).
  *
  * 'arg' is the ztest_ds_t, 'lr' the write record being committed.  When
  * 'buf' is non-NULL the data is copied into it with dmu_read()
  * (immediate write).  Otherwise the whole data block containing the
  * offset is held and handed to dmu_sync(), with ztest_get_done() as the
  * completion callback.
  *
  * Returns ENOENT when the object's creation txg is 0 or later than the
  * record's txg, the error from dmu_bonus_hold()/dmu_buf_hold()/
  * dmu_sync() on failure, and 0 otherwise.
  */
 static int
 ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	ztest_ds_t *zd = arg;
 	objset_t *os = zd->zd_os;
 	uint64_t foid = lr->lr_foid;
 	uint64_t off = lr->lr_offset;
 	uint64_t len = lr->lr_length;
 	uint64_t lrtxg = lr->lr_common.lrc_txg;
 	blkptr_t *lrbp = &lr->lr_blkptr;
 	uint64_t crtxg;
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
 
 	ztest_object_lock(zd, foid, RL_READER);
 	error = dmu_bonus_hold(os, foid, FTAG, &db);
 	if (error != 0) {
 		ztest_object_unlock(zd, foid);
 		return (error);
 	}
 
 	crtxg = ztest_bt_bonus(db)->bt_crtxg;
 
 	/* Reject objects whose creation txg is unset or after this record. */
 	if (crtxg == 0 || crtxg > lrtxg) {
 		dmu_buf_rele(db, FTAG);
 		ztest_object_unlock(zd, foid);
 		return (ENOENT);
 	}
 
 	dmu_object_info_from_db(db, &doi);
 	dmu_buf_rele(db, FTAG);
 	db = NULL;
 
 	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
 	zgd->zgd_zilog = zd->zd_zilog;
 	zgd->zgd_private = zd;
 
 	if (buf != NULL) {
 		/* Immediate write: copy the data straight into 'buf'. */
 		zgd->zgd_rl = ztest_range_lock(zd, foid, off, len, RL_READER);
 
 		error = dmu_read(os, foid, off, len, buf,
 		    DMU_READ_NO_PREFETCH);
 		ASSERT(error == 0);
 
 		ztest_get_done(zgd, error);
 		return (error);
 	}
 
 	/*
 	 * Otherwise operate on the entire data block: align the offset
 	 * down to the block size, or use offset 0 when the block size
 	 * is not a power of two.
 	 */
 	len = doi.doi_data_block_size;
 	if (!ISP2(len)) {
 		ASSERT(off < len);
 		off = 0;
 	} else {
 		off = P2ALIGN(off, len);
 	}
 
 	zgd->zgd_rl = ztest_range_lock(zd, foid, off, len, RL_READER);
 
 	error = dmu_buf_hold(os, foid, off, zgd, &db, DMU_READ_NO_PREFETCH);
 	if (error == 0) {
 		blkptr_t *obp = dmu_buf_get_blkptr(db);
 
 		if (obp != NULL) {
 			ASSERT(BP_IS_HOLE(lrbp));
 			*lrbp = *obp;
 		}
 
 		zgd->zgd_db = db;
 		zgd->zgd_bp = lrbp;
 
 		ASSERT(db->db_offset == off);
 		ASSERT(db->db_size == len);
 
 		error = dmu_sync(zio, lrtxg, ztest_get_done, zgd);
 
 		/* On success the zgd is left for the dmu_sync() callback. */
 		if (error == 0)
 			return (0);
 	}
 
 	ztest_get_done(zgd, error);
 
 	return (error);
 }
 
 /*
  * Allocate a zeroed log record of 'lrsize' bytes.  When 'name' is
  * non-NULL, room for the string and its terminating NUL is appended and
  * the string is copied in directly after the fixed-size part.
  * Free the result with ztest_lr_free() using the same lrsize/name.
  */
 static void *
 ztest_lr_alloc(size_t lrsize, char *name)
 {
 	size_t namesize = 0;
 	char *rec;
 
 	if (name != NULL)
 		namesize = strlen(name) + 1;
 
 	rec = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
 
 	if (name != NULL)
 		bcopy(name, rec + lrsize, namesize);
 
 	return (rec);
 }
 
 /*
  * Free a log record obtained from ztest_lr_alloc().  'lrsize' and 'name'
  * must match the allocation so that the same total size is computed.
  */
 void
 ztest_lr_free(void *lr, size_t lrsize, char *name)
 {
 	size_t namesize = 0;
 
 	if (name != NULL)
 		namesize = strlen(name) + 1;
 
 	umem_free(lr, lrsize + namesize);
 }
 
 /*
  * Look up each of the 'count' descriptors in 'od' by name in its
  * directory.  For every name found, od_object is filled in and the
  * object's type, data block size and generation are read via its bonus
  * buffer; for names not found od_object is left at 0.
  *
  * The caller must hold zd_dirobj_lock.  Returns the number of objects
  * not found.
  */
 static int
 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 
 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
 
 	for (int i = 0; i < count; i++) {
 		ztest_od_t *cur = &od[i];
 		dmu_object_info_t doi;
 		ztest_block_tag_t *tag;
 		dmu_buf_t *db;
 		int error;
 
 		cur->od_object = 0;
 		error = zap_lookup(zd->zd_os, cur->od_dir, cur->od_name,
 		    sizeof (uint64_t), 1, &cur->od_object);
 		if (error != 0) {
 			ASSERT(error == ENOENT);
 			ASSERT(cur->od_object == 0);
 			missing++;
 			continue;
 		}
 
 		ASSERT(cur->od_object != 0);
 		ASSERT(missing == 0);	/* there should be no gaps */
 
 		/* Read type, block size and generation under the lock. */
 		ztest_object_lock(zd, cur->od_object, RL_READER);
 		VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
 		    cur->od_object, FTAG, &db));
 		dmu_object_info_from_db(db, &doi);
 		tag = ztest_bt_bonus(db);
 		ASSERT3U(tag->bt_magic, ==, BT_MAGIC);
 		cur->od_type = doi.doi_type;
 		cur->od_blocksize = doi.doi_data_block_size;
 		cur->od_gen = tag->bt_gen;
 		dmu_buf_rele(db, FTAG);
 		ztest_object_unlock(zd, cur->od_object);
 	}
 
 	return (missing);
 }
 
 /*
  * Create the 'count' objects described by 'od', in order, by building a
  * create record for each and passing it to ztest_replay_create().
  * After the first failure no further creations are attempted; every
  * remaining descriptor gets od_object = 0 and is counted as missing.
  *
  * The caller must hold zd_dirobj_lock.  Returns the number of objects
  * that were not created.
  */
 static int
 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 
 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
 
 	for (int i = 0; i < count; i++) {
 		ztest_od_t *cur = &od[i];
 		lr_create_t *lr;
 
 		if (missing != 0) {
 			cur->od_object = 0;
 			missing++;
 			continue;
 		}
 
 		lr = ztest_lr_alloc(sizeof (*lr), cur->od_name);
 
 		lr->lr_doid = cur->od_dir;
 		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
 		lr->lrz_type = cur->od_crtype;
 		lr->lrz_blocksize = cur->od_crblocksize;
 		lr->lrz_ibshift = ztest_random_ibshift();
 		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
 		lr->lrz_bonuslen = dmu_bonus_max();
 		lr->lr_gen = cur->od_crgen;
 		lr->lr_crtime[0] = time(NULL);
 
 		if (ztest_replay_create(zd, lr, B_FALSE) == 0) {
 			/* Record what was actually created. */
 			cur->od_object = lr->lr_foid;
 			cur->od_type = cur->od_crtype;
 			cur->od_blocksize = cur->od_crblocksize;
 			cur->od_gen = cur->od_crgen;
 			ASSERT(cur->od_object != 0);
 		} else {
 			ASSERT(missing == 0);
 			cur->od_object = 0;
 			missing++;
 		}
 
 		ztest_lr_free(lr, sizeof (*lr), cur->od_name);
 	}
 
 	return (missing);
 }
 
 /*
  * Remove the objects described by 'od', walking the array from the last
  * entry to the first.  Descriptors whose od_object is 0 are skipped.
  * After the first failed removal no further removals are attempted and
  * every remaining descriptor is counted as missing.
  *
  * The caller must hold zd_dirobj_lock.  Returns the number of
  * descriptors counted as missing.
  */
 static int
 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 
 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
 
 	for (int i = count - 1; i >= 0; i--) {
 		ztest_od_t *cur = &od[i];
 		lr_remove_t *lr;
 		int error;
 
 		if (missing != 0) {
 			missing++;
 			continue;
 		}
 
 		/* Nothing to do when no object was found. */
 		if (cur->od_object == 0)
 			continue;
 
 		lr = ztest_lr_alloc(sizeof (*lr), cur->od_name);
 		lr->lr_doid = cur->od_dir;
 
 		error = ztest_replay_remove(zd, lr, B_FALSE);
 		if (error == 0) {
 			cur->od_object = 0;
 		} else {
 			ASSERT3U(error, ==, ENOSPC);
 			missing++;
 		}
 
 		ztest_lr_free(lr, sizeof (*lr), cur->od_name);
 	}
 
 	return (missing);
 }
 
 /*
  * Write 'size' bytes from 'data' to 'object' at 'offset' by building a
  * write record, with the payload placed directly after the record
  * header, and passing it to ztest_replay_write().
  * Returns the result of ztest_replay_write().
  */
 static int
 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
     void *data)
 {
 	lr_write_t *lr;
 	size_t lrsize = sizeof (*lr) + size;
 	int error;
 
 	lr = ztest_lr_alloc(lrsize, NULL);
 
 	lr->lr_foid = object;
 	lr->lr_offset = offset;
 	lr->lr_length = size;
 	lr->lr_blkoff = 0;
 	BP_ZERO(&lr->lr_blkptr);
 
 	/* The payload lives immediately after the fixed-size record. */
 	bcopy(data, lr + 1, size);
 
 	error = ztest_replay_write(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, lrsize, NULL);
 
 	return (error);
 }
 
 /*
  * Free 'size' bytes of 'object' starting at 'offset' by building a
  * truncate record and passing it to ztest_replay_truncate().
  * Returns the result of ztest_replay_truncate().
  */
 static int
 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
 {
 	lr_truncate_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
 	int rc;
 
 	lr->lr_foid = object;
 	lr->lr_offset = offset;
 	lr->lr_length = size;
 
 	rc = ztest_replay_truncate(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, sizeof (*lr), NULL);
 
 	return (rc);
 }
 
 /*
  * Issue a setattr on 'object' by building a setattr record with zero
  * size and mode and passing it to ztest_replay_setattr(), which fills
  * both in itself when it is not replaying.
  * Returns the result of ztest_replay_setattr().
  */
 static int
 ztest_setattr(ztest_ds_t *zd, uint64_t object)
 {
 	lr_setattr_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
 	int rc;
 
 	lr->lr_foid = object;
 	lr->lr_size = 0;
 	lr->lr_mode = 0;
 
 	rc = ztest_replay_setattr(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, sizeof (*lr), NULL);
 
 	return (rc);
 }
 
 /*
  * Preallocate the range [offset, offset + size) of 'object' with
  * dmu_prealloc() and wait for that txg to sync.  When no txg can be
  * assigned, the range is freed with dmu_free_long_range() instead.
  */
 static void
 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
 {
 	objset_t *os = zd->zd_os;
 	dmu_tx_t *tx;
 	rl_t *range;
 	uint64_t txg;
 
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	ztest_object_lock(zd, object, RL_READER);
 	range = ztest_range_lock(zd, object, offset, size, RL_WRITER);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, object, offset, size);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		/* Could not get a txg: release the space instead. */
 		(void) dmu_free_long_range(os, object, offset, size);
 	} else {
 		dmu_prealloc(os, object, offset, size, tx);
 		dmu_tx_commit(tx);
 		txg_wait_synced(dmu_objset_pool(os), txg);
 	}
 
 	ztest_range_unlock(range);
 	ztest_object_unlock(zd, object);
 }
 
 /*
  * Perform one randomly chosen I/O operation on 'object' at 'offset'.
  * The operation type is drawn from ZTEST_IO_TYPES and then, half of the
  * time, overridden with ZTEST_IO_WRITE_TAG.  A scratch buffer of one
  * data block is used for the pattern, zero and rewrite cases.  The
  * whole operation runs with zd_zilog_lock held as reader, and errors
  * from the individual write/truncate/setattr helpers are ignored.
  */
 static void
 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 {
 	dmu_object_info_t doi;
 	ztest_block_tag_t tag;
 	enum ztest_io_type op;
 	uint64_t bs;
 	void *buf;
 
 	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
 	bs = doi.doi_data_block_size;
 	buf = umem_alloc(bs, UMEM_NOFAIL);
 
 	/*
 	 * Choose the operation; the second draw biases the choice toward
 	 * writing block tags.
 	 */
 	op = ztest_random(ZTEST_IO_TYPES);
 	if (ztest_random(2) == 0)
 		op = ZTEST_IO_WRITE_TAG;
 
 	(void) rw_rdlock(&zd->zd_zilog_lock);
 
 	switch (op) {
 
 	case ZTEST_IO_WRITE_TAG:
 		ztest_bt_generate(&tag, zd->zd_os, object, offset, 0, 0, 0);
 		(void) ztest_write(zd, object, offset, sizeof (tag), &tag);
 		break;
 
 	case ZTEST_IO_WRITE_PATTERN: {
 		uint64_t *words = buf;
 
 		(void) memset(buf, 'a' + (object + offset) % 5, bs);
 		if (ztest_random(2) == 0) {
 			/*
 			 * Flip the top bit of two words to induce
 			 * fletcher2 collisions, so that
 			 * zio_ddt_collision() gets to detect and resolve
 			 * them when fletcher2-verify is used for dedup.
 			 */
 			words[0] ^= 1ULL << 63;
 			words[4] ^= 1ULL << 63;
 		}
 		(void) ztest_write(zd, object, offset, bs, buf);
 		break;
 	}
 
 	case ZTEST_IO_WRITE_ZEROES:
 		bzero(buf, bs);
 		(void) ztest_write(zd, object, offset, bs, buf);
 		break;
 
 	case ZTEST_IO_TRUNCATE:
 		(void) ztest_truncate(zd, object, offset, bs);
 		break;
 
 	case ZTEST_IO_SETATTR:
 		(void) ztest_setattr(zd, object);
 		break;
 
 	case ZTEST_IO_REWRITE: {
 		int err;
 
 		/* Change checksum and compression, then rewrite the block. */
 		(void) rw_rdlock(&ztest_name_lock);
 		err = ztest_dsl_prop_set_uint64(zd->zd_name,
 		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
 		    B_FALSE);
 		VERIFY(err == 0 || err == ENOSPC);
 		err = ztest_dsl_prop_set_uint64(zd->zd_name,
 		    ZFS_PROP_COMPRESSION,
 		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
 		    B_FALSE);
 		VERIFY(err == 0 || err == ENOSPC);
 		(void) rw_unlock(&ztest_name_lock);
 
 		VERIFY0(dmu_read(zd->zd_os, object, offset, bs, buf,
 		    DMU_READ_NO_PREFETCH));
 
 		(void) ztest_write(zd, object, offset, bs, buf);
 		break;
 	}
 	}
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 
 	umem_free(buf, bs);
 }
 
 /*
  * Initialize an object description template.
  *
  * od        - descriptor to fill in
  * id        - caller-chosen id, printed as a signed value in the name
  * tag       - name prefix
  * index     - index within the caller's set, printed in brackets
  * type      - object type to use on creation
  * blocksize - block size to use on creation; 0 selects a random one
  *             via ztest_random_blocksize()
  * gen       - generation to use on creation
  *
  * The "current" fields (od_object, od_type, od_blocksize, od_gen) are
  * reset, and the name is formatted as "tag(id)[index]".
  */
 static void
 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
     dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
 {
 	od->od_dir = ZTEST_DIROBJ;
 	od->od_object = 0;
 
 	od->od_crtype = type;
 	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
 	od->od_crgen = gen;
 
 	od->od_type = DMU_OT_NONE;
 	od->od_blocksize = 0;
 	od->od_gen = 0;
 
 	/*
 	 * %lld and %llu require (unsigned) long long arguments; int64_t
 	 * and uint64_t need not be those types, so cast explicitly.
 	 */
 	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
 	    tag, (long long)id, (unsigned long long)index);
 }
 
 /*
  * Look up or create the objects for a test from the od template array
  * ('size' is the array size in bytes).  If any object is missing, or if
  * 'remove' is set, the existing objects are removed and a new set is
  * created; otherwise the existing objects are used as they are.
  *
  * Also records 'od' in zd->zd_od.  Returns 0 on success and -1 when the
  * removal or the creation did not fully succeed.
  */
 static int
 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
 {
 	int count = size / sizeof (*od);
 	int rv = 0;
 
 	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
 
 	if (ztest_lookup(zd, od, count) != 0 || remove) {
 		/* Creation is only attempted after a clean removal. */
 		if (ztest_remove(zd, od, count) != 0 ||
 		    ztest_create(zd, od, count) != 0)
 			rv = -1;
 	}
 
 	zd->zd_od = od;
 	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
 
 	return (rv);
 }
 
 /*
  * Commit the ZIL for a randomly chosen object number and record the
  * resulting commit sequence number in the dataset's shared state.
  * 'id' is unused.
  */
 /* ARGSUSED */
 void
 ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 {
 	zilog_t *zl = zd->zd_zilog;
 
 	(void) rw_rdlock(&zd->zd_zilog_lock);
 
 	zil_commit(zl, ztest_random(ZTEST_OBJECTS));
 
 	/*
 	 * zd_shared lives in memory shared between parent and child, so
 	 * the value stored here survives if we die.  The next iteration
 	 * of ztest_run() will verify that the log really does contain
 	 * this record.
 	 */
 	mutex_enter(&zl->zl_lock);
 	ASSERT(zd->zd_shared != NULL);
 	ASSERT3U(zd->zd_shared->zd_seq, <=, zl->zl_commit_lr_seq);
 	zd->zd_shared->zd_seq = zl->zl_commit_lr_seq;
 	mutex_exit(&zl->zl_lock);
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 }
 
 /*
  * Simulate what happens to the ZIL across an unmount followed by a
  * mount: close the log, reopen it and replay it.  The dataset stays
  * held throughout, in an attempt to expose implicit assumptions about
  * ZIL management.  'id' is unused.
  */
 /* ARGSUSED */
 void
 ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	zilog_t *zilog;
 
 	/*
 	 * zd_dirobj_lock keeps other threads from updating the zil
 	 * (i.e. adding in-memory log records); zd_zilog_lock, taken as
 	 * writer, blocks any I/O.
 	 */
 	VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
 	(void) rw_wrlock(&zd->zd_zilog_lock);
 
 	zilog = zd->zd_zilog;
 
 	/* The zfsvfs_teardown() half. */
 	zil_close(zilog);
 
 	/* The zfsvfs_setup() half; reopening must yield the same zilog. */
 	VERIFY(zil_open(os, ztest_get_data) == zilog);
 	zil_replay(os, zd, ztest_replay_vector);
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
 }
 
 /*
  * Verify three negative cases: creating a pool on a bad vdev spec
  * (single file and mirror) fails with ENOENT, creating a pool whose
  * name already exists fails with EEXIST, and destroying a pool that is
  * open fails with EBUSY.  Neither argument is used.
  */
 /* ARGSUSED */
 void
 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	nvlist_t *bogus;
 	spa_t *spa;
 
 	/*
 	 * A nonexistent backing file.
 	 */
 	bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
 	VERIFY3U(ENOENT, ==,
 	    spa_create("ztest_bad_file", bogus, NULL, NULL));
 	nvlist_free(bogus);
 
 	/*
 	 * A mirror of nonexistent backing files.
 	 */
 	bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
 	VERIFY3U(ENOENT, ==,
 	    spa_create("ztest_bad_mirror", bogus, NULL, NULL));
 	nvlist_free(bogus);
 
 	/*
 	 * The name of the pool under test.  The contents of the nvroot
 	 * should not matter; the create should fail with EEXIST.
 	 */
 	(void) rw_rdlock(&ztest_name_lock);
 	bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
 	VERIFY3U(EEXIST, ==,
 	    spa_create(ztest_opts.zo_pool, bogus, NULL, NULL));
 	nvlist_free(bogus);
 
 	/* With the pool held open, a destroy must be refused. */
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	VERIFY3U(EBUSY, ==, spa_destroy(ztest_opts.zo_pool));
 	spa_close(spa, FTAG);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Create a scratch pool named "<pool>_upgrade" at a random pre-feature-
  * flags SPA version, upgrade it to a random newer version, and verify
  * that the in-core version and the version stored in the pool config
  * agree.  Runs with ztest_vdev_lock held.  Neither argument is used.
  */
 /* ARGSUSED */
 void
 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
 {
 	uint64_t floor = SPA_VERSION_INITIAL;
 	uint64_t oldver, newver;
 	nvlist_t *nvroot, *props;
 	spa_t *spa;
 	char *name;
 
 	VERIFY0(mutex_lock(&ztest_vdev_lock));
 	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
 
 	/*
 	 * Get rid of whatever an earlier run left behind.
 	 */
 	(void) spa_destroy(name);
 
 	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
 	    0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
 
 	/*
 	 * With double or triple parity RAID-Z the starting version must
 	 * be one that supports that parity level; every other parity
 	 * value keeps SPA_VERSION_INITIAL.
 	 */
 	if (ztest_opts.zo_raidz_parity == 2)
 		floor = SPA_VERSION_RAIDZ2;
 	else if (ztest_opts.zo_raidz_parity == 3)
 		floor = SPA_VERSION_RAIDZ3;
 
 	/*
 	 * Draw versions until one is no newer than
 	 * SPA_VERSION_BEFORE_FEATURES, so that the pool can still be
 	 * upgraded.
 	 */
 	do {
 		oldver = ztest_random_spa_version(floor);
 	} while (oldver > SPA_VERSION_BEFORE_FEATURES);
 
 	props = fnvlist_alloc();
 	fnvlist_add_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), oldver);
 	VERIFY0(spa_create(name, nvroot, props, NULL));
 	fnvlist_free(nvroot);
 	fnvlist_free(props);
 
 	VERIFY0(spa_open(name, &spa, FTAG));
 	VERIFY3U(spa_version(spa), ==, oldver);
 	newver = ztest_random_spa_version(oldver + 1);
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("upgrading spa version from %llu to %llu\n",
 		    (u_longlong_t)oldver, (u_longlong_t)newver);
 	}
 
 	spa_upgrade(spa, newver);
 
 	/* The version must have moved forward and match the config. */
 	VERIFY3U(spa_version(spa), >, oldver);
 	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
 	spa_close(spa, FTAG);
 
 	strfree(name);
 	VERIFY0(mutex_unlock(&ztest_vdev_lock));
 }
 
 /*
  * Depth-first search of the vdev tree rooted at 'vd' for a vdev whose
  * vdev_path equals 'path'.  Returns the first match, or NULL if there
  * is none.
  */
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
 		return (vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *hit = vdev_lookup_by_path(vd->vdev_child[c], path);
 
 		if (hit != NULL)
 			return (hit);
 	}
 
 	return (NULL);
 }
 
 /*
  * Return the index of the first top-level vdev that is a hole, or the
  * number of top-level vdevs when none is.  The caller must hold
  * SCL_VDEV as reader.
  */
 int
 find_vdev_hole(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c = 0;
 
 	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
 
 	while (c < rvd->vdev_children && !rvd->vdev_child[c]->vdev_ishole)
 		c++;
 
 	return (c);
 }
 
 /*
  * Exercise top-level vdev addition and slog removal.  If the pool has
  * slogs then, one time in four, the log device at the head of the log
  * class rotor is removed; otherwise a new top-level vdev is added,
  * which itself is a log device one time in four.  Runs with
  * ztest_vdev_lock held.  Neither argument is used.
  */
 /* ARGSUSED */
 void
 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	uint64_t leaves;
 	int remove_slog;
 	int error;
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	zs->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
 
 	/*
 	 * Slogs, when present, get removed 1/4 of the time.
 	 */
 	remove_slog = (spa_has_slogs(spa) && ztest_random(4) == 0);
 
 	if (remove_slog) {
 		uint64_t guid;
 
 		/*
 		 * Take the guid from the head of the log class rotor.
 		 */
 		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
 
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 
 		/*
 		 * The name lock is taken as writer to prevent a race
 		 * between removing a slog (dmu_objset_find) and
 		 * destroying a dataset.  Removing the slog grabs a
 		 * reference on the dataset, which may cause
 		 * dmu_objset_destroy() to fail with EBUSY and thus leave
 		 * the dataset in an inconsistent state.
 		 */
 		VERIFY(rw_wrlock(&ztest_name_lock) == 0);
 		error = spa_vdev_remove(spa, guid, B_FALSE);
 		VERIFY(rw_unlock(&ztest_name_lock) == 0);
 
 		if (error && error != EEXIST)
 			fatal(0, "spa_vdev_remove() = %d", error);
 	} else {
 		nvlist_t *nvroot;
 		int is_log;
 
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 
 		/*
 		 * 1/4 of the added devices are log devices.
 		 */
 		is_log = (ztest_random(4) == 0);
 		nvroot = make_vdev_root(NULL, NULL, NULL,
 		    ztest_opts.zo_vdev_size, 0,
 		    is_log, ztest_opts.zo_raidz,
 		    zs->zs_mirrors, 1);
 
 		error = spa_vdev_add(spa, nvroot);
 		nvlist_free(nvroot);
 
 		if (error != 0) {
 			/* Running out of space is recorded, not fatal. */
 			if (error == ENOSPC)
 				ztest_record_enospc("spa_vdev_add");
 			else
 				fatal(0, "spa_vdev_add() = %d", error);
 		}
 	}
 
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
  * Verify that adding/removing aux devices (l2arc, hot spare) works as
  * expected.
  *
  * The aux class (spares or l2cache) is picked at random.  If the class
  * already has devices then, one time in four, a random one of them is
  * removed; otherwise the first unused aux device path is located and a
  * device is added there.  Runs with ztest_vdev_lock held.  Neither
  * argument is used.
  */
 /* ARGSUSED */
 void
 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	spa_aux_vdev_t *sav;
 	char *aux;
 	uint64_t guid = 0;	/* stays 0 when a device is to be added */
 	int error;
 
 	if (ztest_random(2) == 0) {
 		sav = &spa->spa_spares;
 		aux = ZPOOL_CONFIG_SPARES;
 	} else {
 		sav = &spa->spa_l2cache;
 		aux = ZPOOL_CONFIG_L2CACHE;
 	}
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	if (sav->sav_count != 0 && ztest_random(4) == 0) {
 		/*
 		 * Pick a random device to remove.
 		 */
 		guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
 	} else {
 		/*
 		 * Find an unused device we can add: step zs_vdev_aux until
 		 * the generated path matches neither an existing aux
 		 * device of this class nor any vdev in the pool tree.
 		 */
 		zs->zs_vdev_aux = 0;
 		for (;;) {
 			char path[MAXPATHLEN];
 			int c;
 			(void) snprintf(path, sizeof (path), ztest_aux_template,
 			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
 			    zs->zs_vdev_aux);
 			for (c = 0; c < sav->sav_count; c++)
 				if (strcmp(sav->sav_vdevs[c]->vdev_path,
 				    path) == 0)
 					break;
 			if (c == sav->sav_count &&
 			    vdev_lookup_by_path(rvd, path) == NULL)
 				break;
 			zs->zs_vdev_aux++;
 		}
 	}
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	if (guid == 0) {
 		/*
 		 * Add a new device.
 		 */
 		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
 		    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
 		error = spa_vdev_add(spa, nvroot);
 		/* %p requires a void * argument. */
 		if (error != 0)
 			fatal(0, "spa_vdev_add(%p) = %d", (void *)nvroot,
 			    error);
 		nvlist_free(nvroot);
 	} else {
 		/*
 		 * Remove an existing device.  Sometimes, dirty its
 		 * vdev state first to make sure we handle removal
 		 * of devices that have pending state changes.
 		 */
 		if (ztest_random(2) == 0)
 			(void) vdev_online(spa, guid, 0, NULL);
 
 		error = spa_vdev_remove(spa, guid, B_FALSE);
 		/* %llu requires an unsigned long long argument. */
 		if (error != 0 && error != EBUSY)
 			fatal(0, "spa_vdev_remove(%llu) = %d",
 			    (u_longlong_t)guid, error);
 	}
 
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
  * Split a pool if it has mirror top-level vdevs.
  *
  * Nothing is done unless there are at least three mirror sides and no
  * RAID-Z.  A split config is built from the first child of every
  * top-level mirror, with hole entries standing in for log devices and
  * existing holes, and handed to spa_vdev_split_mirror() to create the
  * pool "splitp".  On success the split/mirror counters in the shared
  * state are adjusted.  Runs with ztest_vdev_lock held.  Neither
  * argument is used.
  */
 /* ARGSUSED */
 void
 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *tree, **child, *config, *split, **schild;
 	uint_t c, children, schildren = 0, lastlogid = 0;
 	int error = 0;
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 
 	/* ensure we have a useable config; mirrors of raidz aren't supported */
 	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
 	/* clean up the old pool, if any */
 	(void) spa_destroy("splitp");
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/* generate a config from the existing config */
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
 	    &tree) == 0);
 	mutex_exit(&spa->spa_props_lock);
 
 	VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) == 0);
 
 	schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
 	/* Fail loudly on allocation failure instead of dereferencing NULL. */
 	VERIFY(schild != NULL);
 	for (c = 0; c < children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		nvlist_t **mchild;
 		uint_t mchildren;
 
 		if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
 			/* Logs and holes are represented by a hole entry. */
 			VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
 			    0) == 0);
 			VERIFY(nvlist_add_string(schild[schildren],
 			    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
 			VERIFY(nvlist_add_uint64(schild[schildren],
 			    ZPOOL_CONFIG_IS_HOLE, 1) == 0);
 			/* Remember where the current run of holes began. */
 			if (lastlogid == 0)
 				lastlogid = schildren;
 			++schildren;
 			continue;
 		}
 		lastlogid = 0;
 		VERIFY(nvlist_lookup_nvlist_array(child[c],
 		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
 		/* Take the first side of the mirror for the new pool. */
 		VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
 	}
 
 	/*
 	 * OK, create a config that can be used to split.  A non-zero
 	 * lastlogid marks a run of hole entries at the tail of the array;
 	 * only the entries before it are passed on.
 	 */
 	VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
 	    lastlogid != 0 ? lastlogid : schildren) == 0);
 
 	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
 
 	/* Every schild entry is freed here, including trailing holes. */
 	for (c = 0; c < schildren; c++)
 		nvlist_free(schild[c]);
 	free(schild);
 	nvlist_free(split);
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	(void) rw_wrlock(&ztest_name_lock);
 	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
 	(void) rw_unlock(&ztest_name_lock);
 
 	nvlist_free(config);
 
 	if (error == 0) {
 		(void) printf("successful split - results:\n");
 		mutex_enter(&spa_namespace_lock);
 		show_pool_stats(spa);
 		show_pool_stats(spa_lookup("splitp"));
 		mutex_exit(&spa_namespace_lock);
 		++zs->zs_splits;
 		--zs->zs_mirrors;
 	}
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
 }
 
 /*
  * Verify that we can attach and detach devices.
  *
  * A random leaf vdev is chosen.  If it has siblings, half of the time
  * it is detached.  Otherwise a new device (one of the two standard
  * paths for that leaf slot, or a random hot spare) is attached to it or
  * replaces it.  The error expected from the current configuration is
  * computed first and compared with what spa_vdev_attach() returns; a
  * mismatch is fatal.  Runs with ztest_vdev_lock held.  Neither argument
  * is used.
  */
 /* ARGSUSED */
 void
 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
 	nvlist_t *root;
 	uint64_t leaves;
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
 	uint64_t oldguid, pguid;
 	uint64_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
 	int oldvd_has_siblings = B_FALSE;
 	int newvd_is_spare = B_FALSE;
 	int oldvd_is_log;
 	int error, expected_error;
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/*
 	 * Decide whether to do an attach or a replace.
 	 */
 	replacing = ztest_random(2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	top = ztest_random_vdev_top(spa, B_TRUE);
 
 	/*
 	 * Pick a random leaf within it.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Locate this vdev.
 	 */
 	oldvd = rvd->vdev_child[top];
 	if (zs->zs_mirrors >= 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
 		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
 		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
 	}
 	if (ztest_opts.zo_raidz > 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
 		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
 		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
 	}
 
 	/*
 	 * If we're already doing an attach or replace, oldvd may be a
 	 * mirror vdev -- in which case, pick a random child.
 	 */
 	while (oldvd->vdev_children != 0) {
 		oldvd_has_siblings = B_TRUE;
 		ASSERT(oldvd->vdev_children >= 2);
 		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
 	}
 
 	/*
 	 * Capture everything needed from oldvd while the config lock is
 	 * still held; only these copies are used after it is dropped.
 	 */
 	oldguid = oldvd->vdev_guid;
 	oldsize = vdev_get_min_asize(oldvd);
 	oldvd_is_log = oldvd->vdev_top->vdev_islog;
 	(void) strcpy(oldpath, oldvd->vdev_path);
 	pvd = oldvd->vdev_parent;
 	pguid = pvd->vdev_guid;
 
 	/*
 	 * If oldvd has siblings, then half of the time, detach it.
 	 */
 	if (oldvd_has_siblings && ztest_random(2) == 0) {
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
 		if (error != 0 && error != ENODEV && error != EBUSY &&
 		    error != ENOTSUP)
 			fatal(0, "detach (%s) returned %d", oldpath, error);
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
 	/*
 	 * For the new vdev, choose with equal probability between the two
 	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
 	 */
 	if (sav->sav_count != 0 && ztest_random(3) == 0) {
 		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
 		newvd_is_spare = B_TRUE;
 		(void) strcpy(newpath, newvd->vdev_path);
 	} else {
 		(void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + leaf);
 		if (ztest_random(2) == 0)
 			newpath[strlen(newpath) - 1] = 'b';
 		newvd = vdev_lookup_by_path(rvd, newpath);
 	}
 
 	if (newvd) {
 		newsize = vdev_get_min_asize(newvd);
 	} else {
 		/*
 		 * Make newsize a little bigger or smaller than oldsize.
 		 * If it's smaller, the attach should fail.
 		 * If it's larger, and we're doing a replace,
 		 * we should get dynamic LUN growth when we're done.
 		 */
 		newsize = 10 * oldsize / (9 + ztest_random(3));
 	}
 
 	/*
 	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
 	 * unless it's a replace; in that case any non-replacing parent is OK.
 	 *
 	 * If newvd is already part of the pool, it should fail with EBUSY.
 	 *
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 */
 	if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
 	    pvd->vdev_ops == &vdev_replacing_ops ||
 	    pvd->vdev_ops == &vdev_spare_ops))
 		expected_error = ENOTSUP;
 	else if (newvd_is_spare && (!replacing || oldvd_is_log))
 		expected_error = ENOTSUP;
 	else if (newvd == oldvd)
 		expected_error = replacing ? 0 : EBUSY;
 	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
 		expected_error = EBUSY;
 	else if (newsize < oldsize)
 		expected_error = EOVERFLOW;
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else
 		expected_error = 0;
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	/*
 	 * Build the nvlist describing newpath.  A size is passed only
 	 * when the path does not already belong to a vdev.
 	 */
 	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
 	    ashift, 0, 0, 0, 1);
 
 	error = spa_vdev_attach(spa, oldguid, root, replacing);
 
 	nvlist_free(root);
 
 	/*
 	 * If our parent was the replacing vdev, but the replace completed,
 	 * then instead of failing with ENOTSUP we may either succeed,
 	 * fail with ENODEV, or fail with EOVERFLOW.
 	 */
 	if (expected_error == ENOTSUP &&
 	    (error == 0 || error == ENODEV || error == EOVERFLOW))
 		expected_error = error;
 
 	/*
 	 * If someone grew the LUN, the replacement may be too small.
 	 */
 	if (error == EOVERFLOW || error == EBUSY)
 		expected_error = error;
 
 	/*
 	 * XXX workaround 6690467
 	 *
 	 * %llu requires unsigned long long arguments, so the uint64_t
 	 * sizes are cast explicitly.
 	 */
 	if (error != expected_error && expected_error != EBUSY) {
 		fatal(0, "attach (%s %llu, %s %llu, %d) "
 		    "returned %d, expected %d",
 		    oldpath, (u_longlong_t)oldsize, newpath,
 		    (u_longlong_t)newsize, replacing, error, expected_error);
 	}
 
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
  * Callback function which expands the physical size of the vdev.
  *
  * 'vd' must be a leaf vdev and 'arg' points to a size_t holding the
  * new size; the file at vd->vdev_path is resized to it with
  * ftruncate().  The caller must hold SCL_STATE as reader.
  *
  * Returns NULL on success.  Returns 'vd' when the backing file cannot
  * be opened or cannot be resized.
  * NOTE(review): presumably the tree walker that invokes this callback
  * treats a non-NULL return as "stop, this vdev failed" -- confirm
  * against the caller.
  */
 vdev_t *
 grow_vdev(vdev_t *vd, void *arg)
 {
 	spa_t *spa = vd->vdev_spa;
 	size_t *newsize = arg;
 	size_t fsize;
 	int fd;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
 		return (vd);
 
 	/* The old size is only needed for the verbose message below. */
 	fsize = lseek(fd, 0, SEEK_END);
 
 	/*
 	 * Do not report success if the file was not actually resized;
 	 * fail the same way as when the file cannot be opened.
 	 */
 	if (ftruncate(fd, *newsize) != 0) {
 		(void) close(fd);
 		return (vd);
 	}
 
 	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("%s grew from %lu to %lu bytes\n",
 		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
 	}
 	(void) close(fd);
 	return (NULL);
 }
 
 /*
  * vdev_walk_tree() callback: expand leaf vdev 'vd' by calling vdev_online()
  * with ZFS_ONLINE_EXPAND.
  *
  * Entered with SCL_STATE held as reader; the lock is released around the
  * vdev_online() call and taken again before returning.  Returns NULL when
  * the expand went through, or 'vd' when it has to be abandoned.
  */
 /* ARGSUSED */
 vdev_t *
 online_vdev(vdev_t *vd, void *arg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint64_t vdguid = vd->vdev_guid;
 	/* Config generation expected once this expand has been applied. */
 	uint64_t want_gen = spa->spa_config_generation + 1;
 	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
 	int rc;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/* vdev_online() is what initializes the new metaslabs. */
 	spa_config_exit(spa, SCL_STATE, spa);
 	rc = vdev_online(spa, vdguid, ZFS_ONLINE_EXPAND, &newstate);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/*
 	 * A failure of the underlying vdev_open() can only be detected
 	 * through newstate, so both the return code and the resulting
 	 * state have to look good before the expand is accepted.
 	 */
 	if (rc != 0 || newstate != VDEV_STATE_HEALTHY) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Unable to expand vdev, state %llu, "
 			    "error %d\n", (u_longlong_t)newstate, rc);
 		}
 		return (vd);
 	}
 	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
 
 	/* Generation is exactly what we predicted: we are done. */
 	if (want_gen == spa->spa_config_generation)
 		return (NULL);
 
 	/*
 	 * The lock was dropped above, so this vdev may have been
 	 * detached or replaced while we were onlining it; we can no
 	 * longer be sure we are talking to the original vdev.
 	 */
 	if (ztest_opts.zo_verbose >= 5) {
 		(void) printf("vdev configuration has changed, "
 		    "guid %llu, state %llu, expected gen %llu, "
 		    "got gen %llu\n",
 		    (u_longlong_t)vdguid,
 		    (u_longlong_t)top->vdev_state,
 		    (u_longlong_t)want_gen,
 		    (u_longlong_t)spa->spa_config_generation);
 	}
 	return (vd);
 }
 
 /*
  * Depth-first walk of the vdev tree rooted at 'vd'.
  *
  * 'func' is invoked on every leaf with 'arg'; the walk stops at, and
  * returns, the first non-NULL value it produces.  With a NULL 'func' the
  * first leaf encountered is returned instead.  NULL is returned when all
  * children were visited without a non-NULL result.
  */
 vdev_t *
 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
 {
 	uint_t idx;
 
 	/* Leaf: hand it straight back, or let the callback decide. */
 	if (vd->vdev_ops->vdev_op_leaf)
 		return (func != NULL ? func(vd, arg) : vd);
 
 	/* Interior vdev: recurse into each child in order. */
 	for (idx = 0; idx < vd->vdev_children; idx++) {
 		vdev_t *hit = vdev_walk_tree(vd->vdev_child[idx], func, arg);
 
 		if (hit != NULL)
 			return (hit);
 	}
 	return (NULL);
 }
 
 /*
  * Verify that dynamic LUN growth works as expected.
  *
  * Picks a random top-level vdev, enlarges the file behind each of its
  * leaves to 9/8 of the first leaf's psize (grow_vdev), onlines the leaves
  * with ZFS_ONLINE_EXPAND (online_vdev), waits for the spa async work to
  * drain, and then requires that both the vdev's metaslab count and the
  * space of its metaslab class have increased.
  *
  * The test backs off without failing when the vdev is not healthy, has a
  * zero psize, is already at least 4x zo_vdev_size, or when the vdev
  * configuration changes while the expansion is in progress.
  */
 /* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa = ztest_spa;
 	vdev_t *vd, *tvd;
 	metaslab_class_t *mc;
 	metaslab_group_t *mg;
 	size_t psize, newsize;
 	uint64_t top;
 	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
 
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	top = ztest_random_vdev_top(spa, B_TRUE);
 
 	/* Snapshot the "before" numbers that are compared at the end. */
 	tvd = spa->spa_root_vdev->vdev_child[top];
 	mg = tvd->vdev_mg;
 	mc = mg->mg_class;
 	old_ms_count = tvd->vdev_ms_count;
 	old_class_space = metaslab_class_get_space(mc);
 
 	/*
 	 * Determine the size of the first leaf vdev associated with
 	 * our top-level device.
 	 */
 	vd = vdev_walk_tree(tvd, NULL, NULL);
 	ASSERT3P(vd, !=, NULL);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	psize = vd->vdev_psize;
 
 	/*
 	 * We only try to expand the vdev if it's healthy, less than 4x its
 	 * original size, and it has a valid psize.
 	 */
 	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
 	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
 		spa_config_exit(spa, SCL_STATE, spa);
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 	ASSERT(psize > 0);
 	newsize = psize + psize / 8;
 	ASSERT3U(newsize, >, psize);
 
 	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("Expanding LUN %s from %lu to %lu\n",
 		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
 	}
 
 	/*
 	 * Growing the vdev is a two step process:
 	 *	1). expand the physical size (i.e. relabel)
 	 *	2). online the vdev to create the new metaslabs
 	 */
 	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
 	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
 	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not expand LUN because "
 			    "the vdev configuration changed.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
 	spa_config_exit(spa, SCL_STATE, spa);
 
 	/*
 	 * Expanding the LUN will update the config asynchronously,
 	 * thus we must wait for the async thread to complete any
 	 * pending tasks before proceeding.
 	 */
 	for (;;) {
 		boolean_t done;
 		mutex_enter(&spa->spa_async_lock);
 		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
 		mutex_exit(&spa->spa_async_lock);
 		if (done)
 			break;
 		txg_wait_synced(spa_get_dsl(spa), 0);
 		(void) poll(NULL, 0, 100);
 	}
 
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/* Re-read the top-level vdev; SCL_STATE was dropped in between. */
 	tvd = spa->spa_root_vdev->vdev_child[top];
 	new_ms_count = tvd->vdev_ms_count;
 	new_class_space = metaslab_class_get_space(mc);
 
 	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not verify LUN expansion due to "
 			    "intervening vdev offline or remove.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
 	/*
 	 * Make sure we were able to grow the vdev.  The values are printed
 	 * as "new <= old" so that the message states the relation that
 	 * actually triggered the failure.
 	 */
 	if (new_ms_count <= old_ms_count) {
 		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
 		    (u_longlong_t)new_ms_count, (u_longlong_t)old_ms_count);
 	}
 
 	/*
 	 * Make sure we were able to grow the pool.
 	 */
 	if (new_class_space <= old_class_space) {
 		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
 		    (u_longlong_t)new_class_space,
 		    (u_longlong_t)old_class_space);
 	}
 
 	if (ztest_opts.zo_verbose >= 5) {
 		char oldnumbuf[6], newnumbuf[6];
 
 		nicenum(old_class_space, oldnumbuf);
 		nicenum(new_class_space, newnumbuf);
 		(void) printf("%s grew from %s to %s\n",
 		    spa->spa_name, oldnumbuf, newnumbuf);
 	}
 
 	spa_config_exit(spa, SCL_STATE, spa);
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
  */
 /*
  * Creation callback handed to dmu_objset_create() by
  * ztest_dataset_create().  Claims object ZTEST_DIROBJ as a
  * DMU_OT_ZAP_OTHER ZAP in the new objset 'os' within 'tx'; 'arg' and
  * 'cr' are not used.  The claim is VERIFY'd to succeed.
  */
 /* ARGSUSED */
 static void
 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	/*
 	 * Create the objects common to all ztest datasets.
 	 */
 	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
 /*
  * Create dataset 'dsname' and, when a ztest_random(100) roll comes up 80
  * or higher, additionally set its sync property to ZFS_SYNC_ALWAYS.
  *
  * Returns whatever dmu_objset_create() returned if that is non-zero or
  * the property is left alone; otherwise the result of the property set.
  */
 static int
 ztest_dataset_create(char *dsname)
 {
 	/* Roll first so the random draw precedes the create, as before. */
 	uint64_t roll = ztest_random(100);
 	int rc;
 
 	rc = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
 	    ztest_objset_create_cb, NULL);
 	if (rc != 0)
 		return (rc);
 
 	/* Rolls below 80 leave the sync property untouched. */
 	if (roll < 80)
 		return (0);
 
 	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("Setting dataset %s to sync always\n", dsname);
 
 	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
 	    ZFS_SYNC_ALWAYS, B_FALSE));
 }
 
 /*
  * dmu_objset_find() callback: sanity-check and then destroy the dataset
  * or snapshot called 'name'.  'arg' is not used.  Always returns 0; any
  * failure to own or destroy the dataset trips a VERIFY.
  */
 /* ARGSUSED */
 static int
 ztest_objset_destroy_cb(const char *name, void *arg)
 {
 	dmu_object_info_t doi;
 	objset_t *os;
 	int error;
 
 	/*
 	 * Take ownership long enough to look at the directory object.
 	 * ENOENT is tolerated because a previous run could have crashed
 	 * part-way through destroying this dataset.
 	 */
 	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
 
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		ASSERT0(error);
 		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
 		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
 	}
 
 	dmu_objset_disown(os, FTAG);
 
 	/* Names without an '@' are heads; everything else is a snapshot. */
 	if (strchr(name, '@') == NULL) {
 		VERIFY0(dsl_destroy_head(name));
 	} else {
 		VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
 	}
 
 	return (0);
 }
 
 /*
  * Take snapshot "<osname>@<id>".
  *
  * Returns B_FALSE (after recording the event) when the pool reports
  * ENOSPC, and B_TRUE when the snapshot was created or already existed.
  * Any other error is passed to fatal().
  */
 static boolean_t
 ztest_snapshot_create(char *osname, uint64_t id)
 {
 	char shortname[MAXNAMELEN];
 	int rc;
 
 	/* Only the part after the '@' is needed by the snapshot call. */
 	(void) snprintf(shortname, sizeof (shortname), "%llu",
 	    (u_longlong_t)id);
 
 	rc = dmu_objset_snapshot_one(osname, shortname);
 
 	switch (rc) {
 	case 0:
 	case EEXIST:
 		return (B_TRUE);
 	case ENOSPC:
 		ztest_record_enospc(FTAG);
 		return (B_FALSE);
 	default:
 		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
 		    shortname, rc);
 		return (B_TRUE);
 	}
 }
 
 /*
  * Destroy snapshot "<osname>@<id>".  A snapshot that does not exist
  * (ENOENT) is not an error; any other failure is passed to fatal().
  * Always returns B_TRUE.
  */
 static boolean_t
 ztest_snapshot_destroy(char *osname, uint64_t id)
 {
 	char fullname[MAXNAMELEN];
 	int rc;
 
 	(void) snprintf(fullname, MAXNAMELEN, "%s@%llu", osname,
 	    (u_longlong_t)id);
 
 	rc = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (rc == 0 || rc == ENOENT)
 		return (B_TRUE);
 
 	fatal(0, "ztest_snapshot_destroy(%s) = %d", fullname, rc);
 	return (B_TRUE);
 }
 
 /*
  * Exercise the lifecycle of a scratch dataset "<pool>/temp_<id>":
  * optionally replay its intent log if it survived a previous run, destroy
  * it together with its children and snapshots, confirm it is gone,
  * re-create it, populate it, and check that create/hold/own behave as
  * expected while it is owned.  The whole sequence runs with
  * ztest_name_lock held as reader.  'zd' is not used; the test works on
  * its own temporary ztest_ds_t.
  */
 /* ARGSUSED */
 void
 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_ds_t zdtmp;
 	int iters;
 	int error;
 	objset_t *os, *os2;
 	char name[MAXNAMELEN];
 	zilog_t *zilog;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
 	    ztest_opts.zo_pool, (u_longlong_t)id);
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
 	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
 	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
 	 */
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
 		ztest_zd_init(&zdtmp, NULL, os);
 		zil_replay(os, &zdtmp, ztest_replay_vector);
 		ztest_zd_fini(&zdtmp);
 		dmu_objset_disown(os, FTAG);
 	}
 
 	/*
 	 * There may be an old instance of the dataset we're about to
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
 	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
 	VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
 	    FTAG, &os));
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
 	error = ztest_dataset_create(name);
 	if (error) {
 		/* Running out of space is recorded, not treated as failure. */
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			(void) rw_unlock(&ztest_name_lock);
 			return;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
 	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
 
 	ztest_zd_init(&zdtmp, NULL, os);
 
 	/*
 	 * Open the intent log for it.
 	 */
 	zilog = zil_open(os, ztest_get_data);
 
 	/*
 	 * Put some objects in there, do a little I/O to them,
 	 * and randomly take a couple of snapshots along the way.
 	 */
 	iters = ztest_random(5);
 	for (int i = 0; i < iters; i++) {
 		ztest_dmu_object_alloc_free(&zdtmp, id);
 		/* Snapshots taken here are named after the loop index. */
 		if (ztest_random(iters) == 0)
 			(void) ztest_snapshot_create(name, i);
 	}
 
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
 	VERIFY3U(EEXIST, ==,
 	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
 
 	/*
 	 * Verify that we can hold an objset that is also owned.
 	 */
 	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
 	dmu_objset_rele(os2, FTAG);
 
 	/*
 	 * Verify that we cannot own an objset that is already owned.
 	 */
 	VERIFY3U(EBUSY, ==,
 	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
 
 	/* Tear down in the reverse order of setup. */
 	zil_close(zilog);
 	dmu_objset_disown(os, FTAG);
 	ztest_zd_fini(&zdtmp);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
  *
  * Under ztest_name_lock (reader), destroys and then re-creates the
  * snapshot of zd's dataset that is named after 'id'.
  */
 void
 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	char *dsname = zd->zd_name;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	/* Drop whatever snapshot exists under this id, then take a new one. */
 	(void) ztest_snapshot_destroy(dsname, id);
 	(void) ztest_snapshot_create(dsname, id);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Cleanup non-standard snapshots and clones.
  */
 void
 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 {
 	char snap1name[MAXNAMELEN];
 	char clone1name[MAXNAMELEN];
 	char snap2name[MAXNAMELEN];
 	char clone2name[MAXNAMELEN];
 	char snap3name[MAXNAMELEN];
 	int error;
 
 	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
 	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
 	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
 	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
 	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
 	error = dsl_destroy_head(clone2name);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
 	error = dsl_destroy_snapshot(snap3name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
 	error = dsl_destroy_snapshot(snap2name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
 	error = dsl_destroy_head(clone1name);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
 	error = dsl_destroy_snapshot(snap1name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
 }
 
 /*
  * Verify dsl_dataset_promote handles EBUSY
  *
  * Builds the chain
  *
  *	<osname>@s1_<id>  ->  <osname>/c1_<id>  ->  @s2_<id>, @s3_<id>
  *	<osname>/c1_<id>@s3_<id>  ->  <osname>/c2_<id>
  *
  * then owns snapshot 2 and requires that promoting clone 2 fails with
  * EBUSY while that ownership is held.  ENOSPC at any step is recorded and
  * ends the test early.  Leftovers are removed with
  * ztest_dsl_dataset_cleanup() both before and after the test, and the
  * whole sequence runs with ztest_name_lock held as reader.
  */
 void
 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os;
 	char snap1name[MAXNAMELEN];
 	char clone1name[MAXNAMELEN];
 	char snap2name[MAXNAMELEN];
 	char clone2name[MAXNAMELEN];
 	char snap3name[MAXNAMELEN];
 	char *osname = zd->zd_name;
 	/* %llu needs an unsigned long long; uint64_t may be a narrower rank. */
 	u_longlong_t tag = (u_longlong_t)id;
 	int error;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	ztest_dsl_dataset_cleanup(osname, id);
 
 	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, tag);
 	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, tag);
 	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, tag);
 	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, tag);
 	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, tag);
 
 	/* Each snapshot call takes only the part of the name after '@'. */
 	error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap1name, error);
 	}
 
 	error = dmu_objset_clone(clone1name, snap1name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
 	}
 
 	error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap2name, error);
 	}
 
 	error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap3name, error);
 	}
 
 	error = dmu_objset_clone(clone2name, snap3name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
 	}
 
 	/* Own snapshot 2 so that the promote below has something in its way. */
 	error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (error)
 		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
 	error = dsl_dataset_promote(clone2name, NULL);
 	if (error == ENOSPC) {
 		dmu_objset_disown(os, FTAG);
 		ztest_record_enospc(FTAG);
 		goto out;
 	}
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
 		    error);
 	dmu_objset_disown(os, FTAG);
 
 out:
 	ztest_dsl_dataset_cleanup(osname, id);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Verify that dmu_object_{alloc,free} work as expected.
  *
  * Works on a batch of four DMU_OT_UINT64_OTHER objects keyed by 'id' in
  * zd's dataset.
  */
 void
 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t od[4];
 	int nobjs = sizeof (od) / sizeof (od[0]);
 	int slot;
 
 	/* Fill in one object descriptor per slot of the batch. */
 	for (slot = 0; slot < nobjs; slot++) {
 		ztest_od_init(&od[slot], id, FTAG, slot,
 		    DMU_OT_UINT64_OTHER, 0, 0);
 	}
 
 	/*
 	 * Destroy the previous batch of objects and create a new batch;
 	 * give up quietly if that does not succeed.
 	 */
 	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
 		return;
 
 	/*
 	 * Do some I/O on the new objects: keep going until a
 	 * ztest_random(4 * nobjs) roll comes up zero, each time picking
 	 * a random object and a random block-aligned offset.
 	 */
 	for (;;) {
 		if (ztest_random(4 * nobjs) == 0)
 			break;
 		ztest_io(zd, od[ztest_random(nobjs)].od_object,
 		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 	}
 }
 
 /*
  * Verify that dmu_{read,write} work as expected.
  *
  * Reads a random window of the packobj/bigobj pair described below,
  * checks that the two objects agree, rewrites (or, free_percent of the
  * time, frees) the window in a single tx, and finally reads it back to
  * confirm that what was written is what is returned.  Returns early
  * without doing anything if the objects cannot be set up or the tx
  * cannot be assigned.
  */
 void
 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[2];
 	dmu_tx_t *tx;
 	int freeit, error;
 	/* Unsigned and 64-bit so it matches 's' and the %llx conversions. */
 	uint64_t i;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
 	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
 	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
 	int free_percent = 5;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is arbitrary. It doesn't have to be a power of two,
 	 * and it doesn't have any relation to the object blocksize.
 	 * The only requirement is that it can hold at least two bufwads.
 	 *
 	 * Normally, we write the bufwad to each of these locations.
 	 * However, free_percent of the time we instead write zeroes to
 	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
 	 * bigobj to packobj, we can verify that the DMU is correctly
 	 * tracking which parts of an object are allocated and free,
 	 * and that the contents of the allocated blocks are correct.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
 	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	/* The chunk size in effect is the one recorded in od_gen. */
 	bigobj = od[0].od_object;
 	packobj = od[1].od_object;
 	chunksize = od[0].od_gen;
 	ASSERT(chunksize == od[1].od_gen);
 
 	/*
 	 * Prefetch a random chunk of the big object.
 	 * Our aim here is to get some async reads in flight
 	 * for blocks that we may free below; the DMU should
 	 * handle this race correctly.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
 	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
 	    ZIO_PRIORITY_SYNC_READ);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * chunksize;
 	bigsize = s * chunksize;
 
 	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
 
 	/*
 	 * free_percent of the time, free a range of bigobj rather than
 	 * overwriting it.
 	 */
 	freeit = (ztest_random(100) < free_percent);
 
 	/*
 	 * Read the current contents of our objects.
 	 */
 	error = dmu_read(os, packobj, packoff, packsize, packbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT0(error);
 	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT0(error);
 
 	/*
 	 * Get a tx for the mods to both packobj and bigobj.
 	 */
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, packobj, packoff, packsize);
 
 	if (freeit)
 		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
 	else
 		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
 	/* This accounts for setting the checksum/compression. */
 	dmu_tx_hold_bonus(tx, bigobj);
 
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0) {
 		/* No tx was assigned; just release our buffers and bail. */
 		umem_free(packbuf, packsize);
 		umem_free(bigbuf, bigsize);
 		return;
 	}
 
 	/* Re-roll until the checksum is below the legacy-function limit. */
 	enum zio_checksum cksum;
 	do {
 		cksum = (enum zio_checksum)
 		    ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
 	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 	dmu_object_set_checksum(os, bigobj, cksum, tx);
 
 	/* Likewise for the compression function. */
 	enum zio_compress comp;
 	do {
 		comp = (enum zio_compress)
 		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
 	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
 	dmu_object_set_compress(os, bigobj, comp, tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    (u_longlong_t)pack->bw_txg, (u_longlong_t)txg);
 
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    (u_longlong_t)pack->bw_index, (u_longlong_t)n,
 			    (u_longlong_t)i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		if (freeit) {
 			bzero(pack, sizeof (bufwad_t));
 		} else {
 			pack->bw_index = n + i;
 			pack->bw_txg = txg;
 			pack->bw_data = 1 + ztest_random(-2ULL);
 		}
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 
 	/*
 	 * We've verified all the old bufwads, and made new ones.
 	 * Now write them out.
 	 */
 	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
 	} else {
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Sanity check the stuff we just wrote.
 	 */
 	{
 		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 		VERIFY(0 == dmu_read(os, packobj, packoff,
 		    packsize, packcheck, DMU_READ_PREFETCH));
 		VERIFY(0 == dmu_read(os, bigobj, bigoff,
 		    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 		umem_free(packcheck, packsize);
 		umem_free(bigcheck, bigsize);
 	}
 
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 }
 
 /*
  * Check and then refresh 's' consecutive bufwads starting at index 'n'.
  *
  *	s		number of bufwads / chunks to process
  *	packbuf		dense array of 's' bufwads (contents of packobj)
  *	bigbuf		's' chunks of 'chunksize' bytes (contents of bigobj)
  *	bigsize		size of bigbuf in bytes, used for bounds assertions
  *	n		index of the first bufwad
  *	chunksize	size of one bigbuf chunk in bytes
  *	txg		txg the update belongs to
  *
  * Any inconsistency between packbuf and bigbuf, a bufwad stamped with a
  * txg later than 'txg', or a non-empty bufwad carrying the wrong index is
  * reported through fatal().  Both buffers are updated in place.
  */
 void
 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
     uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
 {
 	uint64_t i;
 	bufwad_t *pack;
 	bufwad_t *bigH;
 	bufwad_t *bigT;
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		/* Cast for %llx: uint64_t need not be unsigned long long. */
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    (u_longlong_t)pack->bw_txg, (u_longlong_t)txg);
 
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    (u_longlong_t)pack->bw_index, (u_longlong_t)n,
 			    (u_longlong_t)i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		pack->bw_index = n + i;
 		pack->bw_txg = txg;
 		pack->bw_data = 1 + ztest_random(-2ULL);
 
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 }
 
 /*
  * Variant of the packobj/bigobj consistency test in which bigobj is
  * written through arc buffers obtained from dmu_request_arcbuf() and
  * handed over with dmu_assign_arcbuf(), while packobj is still written
  * with dmu_write().  The same window is rewritten seven times, each pass
  * targeting a different dbuf state (see the list above the main loop).
  * Returns early if the objects cannot be set up or a tx cannot be
  * assigned.
  */
 void
 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[2];
 	dmu_tx_t *tx;
 	uint64_t i;
 	int error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf;
 	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
 	uint64_t blocksize = ztest_random_blocksize();
 	uint64_t chunksize = blocksize;
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 9;
 	dmu_buf_t *bonus_db;
 	arc_buf_t **bigbuf_arcbufs;
 	dmu_object_info_t doi;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is set equal to bigobj block size so that
 	 * dmu_assign_arcbuf() can be tested for object updates.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
 	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	/* Use the block size reported for bigobj, not the one we asked for. */
 	bigobj = od[0].od_object;
 	packobj = od[1].od_object;
 	blocksize = od[0].od_blocksize;
 	chunksize = blocksize;
 	ASSERT(chunksize == od[1].od_gen);
 
 	/*
 	 * The chunk must be a power-of-two block that matches bigobj's data
 	 * block size and is able to hold a bufwad at both its head and tail.
 	 */
 	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
 	VERIFY(ISP2(doi.doi_data_block_size));
 	VERIFY(chunksize == doi.doi_data_block_size);
 	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * chunksize;
 	bigsize = s * chunksize;
 
 	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
 
 	/* Two slots per chunk: iteration 5 uses a pair of half-size arcbufs. */
 	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
 
 	/*
 	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
 	 * Iteration 1 test zcopy to already referenced dbufs.
 	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
 	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
 	 * Iteration 4 test zcopy when dbuf is no longer dirty.
 	 * Iteration 5 test zcopy when it can't be done.
 	 * Iteration 6 one more zcopy write.
 	 */
 	for (i = 0; i < 7; i++) {
 		uint64_t j;
 		uint64_t off;
 
 		/*
 		 * In iteration 5 (i == 5) use arcbufs
 		 * that don't match bigobj blksz to test
 		 * dmu_assign_arcbuf() when it can't directly
 		 * assign an arcbuf to a dbuf.
 		 */
 		for (j = 0; j < s; j++) {
 			if (i != 5) {
 				bigbuf_arcbufs[j] =
 				    dmu_request_arcbuf(bonus_db, chunksize);
 			} else {
 				bigbuf_arcbufs[2 * j] =
 				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 				bigbuf_arcbufs[2 * j + 1] =
 				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 			}
 		}
 
 		/*
 		 * Get a tx for the mods to both packobj and bigobj.
 		 */
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_write(tx, packobj, packoff, packsize);
 		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0) {
 			/*
 			 * No tx: give back every arcbuf requested for this
 			 * iteration and release everything else we hold.
 			 */
 			umem_free(packbuf, packsize);
 			umem_free(bigbuf, bigsize);
 			for (j = 0; j < s; j++) {
 				if (i != 5) {
 					dmu_return_arcbuf(bigbuf_arcbufs[j]);
 				} else {
 					dmu_return_arcbuf(
 					    bigbuf_arcbufs[2 * j]);
 					dmu_return_arcbuf(
 					    bigbuf_arcbufs[2 * j + 1]);
 				}
 			}
 			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 			dmu_buf_rele(bonus_db, FTAG);
 			return;
 		}
 
 		/*
 		 * 50% of the time don't read objects in the 1st iteration to
 		 * test dmu_assign_arcbuf() for the case when there're no
 		 * existing dbufs for the specified offsets.
 		 */
 		if (i != 0 || ztest_random(2) != 0) {
 			error = dmu_read(os, packobj, packoff,
 			    packsize, packbuf, DMU_READ_PREFETCH);
 			ASSERT0(error);
 			error = dmu_read(os, bigobj, bigoff, bigsize,
 			    bigbuf, DMU_READ_PREFETCH);
 			ASSERT0(error);
 		}
 		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
 		    n, chunksize, txg);
 
 		/*
 		 * We've verified all the old bufwads, and made new ones.
 		 * Now write them out.
 		 */
 		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		/* Copy each chunk of bigbuf into its arcbuf(s) and assign it. */
 		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
 			dmu_buf_t *dbt;
 			if (i != 5) {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
 				    bigbuf_arcbufs[j]->b_data, chunksize);
 			} else {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
 				    bigbuf_arcbufs[2 * j]->b_data,
 				    chunksize / 2);
 				bcopy((caddr_t)bigbuf + (off - bigoff) +
 				    chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1]->b_data,
 				    chunksize / 2);
 			}
 
 			/* Iteration 1: keep the dbuf held across the assign. */
 			if (i == 1) {
 				VERIFY(dmu_buf_hold(os, bigobj, off,
 				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
 			}
 			if (i != 5) {
 				dmu_assign_arcbuf(bonus_db, off,
 				    bigbuf_arcbufs[j], tx);
 			} else {
 				dmu_assign_arcbuf(bonus_db, off,
 				    bigbuf_arcbufs[2 * j], tx);
 				dmu_assign_arcbuf(bonus_db,
 				    off + chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1], tx);
 			}
 			if (i == 1) {
 				dmu_buf_rele(dbt, FTAG);
 			}
 		}
 		dmu_tx_commit(tx);
 
 		/*
 		 * Sanity check the stuff we just wrote.
 		 */
 		{
 			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 			VERIFY(0 == dmu_read(os, packobj, packoff,
 			    packsize, packcheck, DMU_READ_PREFETCH));
 			VERIFY(0 == dmu_read(os, bigobj, bigoff,
 			    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 			umem_free(packcheck, packsize);
 			umem_free(bigcheck, bigsize);
 		}
 		/*
 		 * Wait for the next txg to open after iteration 2 and for a
 		 * sync after iteration 3, setting up the dbuf states that
 		 * iterations 3 and 4 are meant to exercise.
 		 */
 		if (i == 2) {
 			txg_wait_open(dmu_objset_pool(os), 0);
 		} else if (i == 3) {
 			txg_wait_synced(dmu_objset_pool(os), 0);
 		}
 	}
 
 	dmu_buf_rele(bonus_db, FTAG);
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 }
 
 /*
  * Issue a random number of ztest_io() calls at one large, randomly
  * chosen offset of an object shared under ID_PARALLEL.  'id' is unused.
  */
 /* ARGSUSED */
 void
 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t od[1];
 	uint64_t where = (1ULL << (ztest_random(20) + 43)) +
 	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 
 	/*
 	 * Every caller uses the same ID_PARALLEL object, so multiple
 	 * threads end up writing to large offsets of one object -- even
 	 * to the same blocks -- which must not cause any trouble.
 	 */
 	ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	/* Stop as soon as a ztest_random(10) roll comes up zero. */
 	for (;;) {
 		if (ztest_random(10) == 0)
 			break;
 		ztest_io(zd, od[0].od_object, where);
 	}
 }
 
 /*
  * Call ztest_truncate() and then ztest_prealloc() on a randomly placed
  * range of an object, and follow up with a random number of block-sized
  * ztest_write() and ztest_io() calls at block offsets derived from that
  * range.
  */
 void
 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t od[1];
 	uint64_t start = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
 	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 	uint64_t nblocks = ztest_random(20) + 1;
 	uint64_t bsize = ztest_random_blocksize();
 	uint64_t span;
 	uint64_t obj;
 	void *payload;
 
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, bsize, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
 		return;
 
 	obj = od[0].od_object;
 	span = nblocks * bsize;
 
 	if (ztest_truncate(zd, obj, start, span) != 0)
 		return;
 
 	ztest_prealloc(zd, obj, start, span);
 
 	/* One block's worth of data from umem_zalloc() is reused per write. */
 	payload = umem_zalloc(bsize, UMEM_NOFAIL);
 
 	for (;;) {
 		uint64_t where;
 
 		/* Stop as soon as ztest_random(nblocks) yields 0. */
 		if (ztest_random(nblocks) == 0)
 			break;
 
 		where = start + (ztest_random(nblocks) * bsize);
 		if (ztest_write(zd, obj, where, bsize, payload) != 0)
 			break;
 
 		while (ztest_random(4) != 0)
 			ztest_io(zd, obj, where);
 	}
 
 	umem_free(payload, bsize);
 }
 
 /*
  * Verify that zap_{create,destroy,add,remove,update} work as expected.
  */
 #define	ZTEST_ZAP_MIN_INTS	1
 #define	ZTEST_ZAP_MAX_INTS	4
 #define	ZTEST_ZAP_MAX_PROPS	1000
 
 /*
  * Exercise basic ZAP operations on a single object in three phases:
  *
  *   1. add, query and remove two entries whose names are expected to
  *      collide in the ZAP hash;
  *   2. validate (if present) and then update a randomly chosen
  *      txg_N / prop_N pair of entries;
  *   3. remove another randomly chosen txg_N / prop_N pair, if it exists.
  *
  * zd identifies the dataset under test; id is passed to ztest_od_init()
  * for the object descriptor.  The function returns early, without
  * reporting an error, when ztest_object_init() fails or when
  * ztest_tx_assign() returns 0.
  */
 void
 ztest_zap(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	uint64_t object;
 	uint64_t txg, last_txg;
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
 	uint64_t zl_ints, zl_intsize, prop;
 	int i, ints;
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
 	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
 		return;
 
 	object = od[0].od_object;
 
 	/*
 	 * Generate a known hash collision, and verify that
 	 * we can lookup and remove both entries.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		return;
 	for (i = 0; i < 2; i++) {
 		value[i] = i;
 		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
 		    1, &value[i], tx));
 	}
 	for (i = 0; i < 2; i++) {
 		/* A second add of the same name must be rejected. */
 		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
 		    sizeof (uint64_t), 1, &value[i], tx));
 		VERIFY3U(0, ==,
 		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, 1);
 	}
 	for (i = 0; i < 2; i++) {
 		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
 	}
 	dmu_tx_commit(tx);
 
 	/*
 	 * Generate a bunch of random entries.
 	 */
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 	bzero(value, sizeof (value));
 	last_txg = 0;
 
 	/*
 	 * If these zap entries already exist, validate their contents.
 	 */
 	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 	if (error == 0) {
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, 1);
 
 		VERIFY(zap_lookup(os, object, txgname, zl_intsize,
 		    zl_ints, &last_txg) == 0);
 
 		VERIFY(zap_length(os, object, propname, &zl_intsize,
 		    &zl_ints) == 0);
 
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, ints);
 
 		VERIFY(zap_lookup(os, object, propname, zl_intsize,
 		    zl_ints, value) == 0);
 
 		for (i = 0; i < ints; i++) {
 			ASSERT3U(value[i], ==, last_txg + object + i);
 		}
 	} else {
 		ASSERT3U(error, ==, ENOENT);
 	}
 
 	/*
 	 * Atomically update two entries in our zap object.
 	 * The first is named txg_%llu, and contains the txg
 	 * in which the property was last updated.  The second
 	 * is named prop_%llu, and the nth element of its value
 	 * should be txg + object + n.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		return;
 
 	/*
 	 * A stored txg newer than the one just assigned would mean data
 	 * from the future leaked into the object.  The casts match the
 	 * %llu conversions, as in the sprintf() calls above.
 	 */
 	if (last_txg > txg)
 		fatal(0, "zap future leak: old %llu new %llu",
 		    (u_longlong_t)last_txg, (u_longlong_t)txg);
 
 	for (i = 0; i < ints; i++)
 		value[i] = txg + object + i;
 
 	VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
 	    1, &txg, tx));
 	VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
 	    ints, value, tx));
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Remove a random pair of entries.
 	 */
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 
 	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 
 	/* Nothing to remove if this pair was never written. */
 	if (error == ENOENT)
 		return;
 
 	ASSERT0(error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		return;
 	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
 	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
 	dmu_tx_commit(tx);
 }
 
 /*
  * Testcase to test the upgrading of a microzap to fatzap.
  *
  * Adds 2050 uniquely named entries to one ZAP object, each in its own
  * transaction.  zd identifies the dataset under test; id is passed to
  * ztest_od_init() and is also embedded in every entry name.  The function
  * returns early when ztest_object_init() fails or ztest_tx_assign()
  * returns 0.
  */
 void
 ztest_fzap(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	uint64_t object, txg;
 
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
 		return;
 
 	object = od[0].od_object;
 
 	/*
 	 * Add entries to this ZAP and make sure it spills over
 	 * and gets upgraded to a fatzap. Also, since we are adding
 	 * 2050 entries we should see ptrtbl growth and leaf-block split.
 	 */
 	for (int i = 0; i < 2050; i++) {
 		char name[MAXNAMELEN];
 		uint64_t value = i;
 		dmu_tx_t *tx;
 		int error;
 
 		/* Cast so the arguments match the %llu conversions. */
 		(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
 		    (u_longlong_t)id, (u_longlong_t)value);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, B_TRUE, name);
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0)
 			return;
 		error = zap_add(os, object, name, sizeof (uint64_t), 1,
 		    &value, tx);
 		/* The entry may already be present; EEXIST is tolerated. */
 		ASSERT(error == 0 || error == EEXIST);
 		dmu_tx_commit(tx);
 	}
 }
 
 /*
  * Perform one randomly selected ZAP operation (length, lookup, add, update
  * or remove) on a randomly named entry of a ZAP object whose descriptor is
  * initialized with ID_PARALLEL.
  */
 /* ARGSUSED */
 void
 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
 	dmu_tx_t *tx = NULL;
 	int op, pos, namelen, error;
 	int micro = ztest_random(2);
 	char name[20], string_value[20];
 	void *data;
 
 	ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	object = od[0].od_object;
 
 	/*
 	 * Build a name of the form 'xxx.....': three random printable
 	 * characters followed by dots.  With 94 candidate characters and a
 	 * length (terminator included) between 6 and 20, that gives
 	 * 94^3 * 15 = 12,458,760 distinct names.
 	 */
 	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
 
 	for (pos = 0; pos < namelen - 1; pos++) {
 		if (pos < 3)
 			name[pos] = '!' + ztest_random('~' - '!' + 1);
 		else
 			name[pos] = '.';
 	}
 	name[pos] = '\0';
 
 	/*
 	 * Even-length names in a non-micro ZAP carry a copy of the name as
 	 * a byte string; every other entry carries a single txg.
 	 */
 	if ((namelen & 1) == 0 && !micro) {
 		wsize = 1;
 		wc = namelen;
 		data = string_value;
 	} else {
 		wsize = sizeof (txg);
 		wc = 1;
 		data = &txg;
 	}
 
 	count = -1ULL;
 	VERIFY0(zap_count(os, object, &count));
 	ASSERT(count != -1ULL);
 
 	/*
 	 * Pick the operation: 0 = length, 1 = lookup, 2 = add,
 	 * 3 = update, 4 = remove.  Only 2 and above need a transaction.
 	 */
 	op = ztest_random(5);
 
 	if (op < 2) {
 		txg = 0;
 		bzero(string_value, namelen);
 	} else {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0)
 			return;
 		bcopy(name, string_value, namelen);
 	}
 
 	if (op == 0) {
 		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
 		if (error == 0) {
 			ASSERT3U(wsize, ==, zl_wsize);
 			ASSERT3U(wc, ==, zl_wc);
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 	} else if (op == 1) {
 		error = zap_lookup(os, object, name, wsize, wc, data);
 		if (error == 0) {
 			/* A string value must be identical to its name. */
 			if (data == string_value &&
 			    bcmp(name, data, namelen) != 0)
 				fatal(0, "name '%s' != val '%s' len %d",
 				    name, data, namelen);
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 	} else if (op == 2) {
 		error = zap_add(os, object, name, wsize, wc, data, tx);
 		ASSERT(error == 0 || error == EEXIST);
 	} else if (op == 3) {
 		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
 	} else if (op == 4) {
 		error = zap_remove(os, object, name, tx);
 		ASSERT(error == 0 || error == ENOENT);
 	}
 
 	if (tx != NULL)
 		dmu_tx_commit(tx);
 }
 
 /*
  * Commit callback data.
  *
  * One of these is allocated per registered commit callback and handed to
  * ztest_commit_callback() as its argument.
  */
 typedef struct ztest_cb_data {
 	/* linkage used when the entry is placed on zcl.zcl_callbacks */
 	list_node_t		zcd_node;
 	/* txg recorded for the callback; 0 when none was assigned */
 	uint64_t		zcd_txg;
 	/* error value the callback must be invoked with */
 	int			zcd_expected_err;
 	/* set once the entry has been inserted into zcl.zcl_callbacks */
 	boolean_t		zcd_added;
 	/* set by the callback itself when it runs */
 	boolean_t		zcd_called;
 	/* pool whose last synced txg the callback checks zcd_txg against */
 	spa_t			*zcd_spa;
 } ztest_cb_data_t;
 
 /*
  * The commit callback proper.  arg is the ztest_cb_data_t registered with
  * the transaction and error is the status the callback is invoked with.
  * The callback checks that it runs exactly once, with the expected error,
  * and not before its txg has synced.  Unless the transaction was canceled
  * it also frees the callback data.
  */
 static void
 ztest_commit_callback(void *arg, int error)
 {
 	ztest_cb_data_t *data = arg;
 	uint64_t last_synced;
 
 	VERIFY(data != NULL);
 	VERIFY3S(data->zcd_expected_err, ==, error);
 	VERIFY(!data->zcd_called);
 
 	last_synced = spa_last_synced_txg(data->zcd_spa);
 	if (data->zcd_txg > last_synced) {
 		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
 		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
 		    last_synced);
 	}
 
 	data->zcd_called = B_TRUE;
 
 	if (error == ECANCELED) {
 		ASSERT0(data->zcd_txg);
 		ASSERT(!data->zcd_added);
 
 		/*
 		 * Ideally the private data would be released right here.
 		 * The caller, however, inspects zcd_called once
 		 * dmu_tx_abort() returns, so freeing is left to the caller.
 		 */
 		return;
 	}
 
 	/*
 	 * Entries that made it onto the global callback list have to be
 	 * unlinked from it, under the list lock, before they are freed.
 	 */
 	if (data->zcd_added) {
 		ASSERT3U(data->zcd_txg, !=, 0);
 
 		(void) mutex_lock(&zcl.zcl_callbacks_lock);
 		list_remove(&zcl.zcl_callbacks, data);
 		(void) mutex_unlock(&zcl.zcl_callbacks_lock);
 	}
 
 	umem_free(data, sizeof (ztest_cb_data_t));
 }
 
 /*
  * Allocate a callback data structure from umem_zalloc() and record in it
  * the given txg and the spa that os belongs to.  Returns the new structure.
  */
 static ztest_cb_data_t *
 ztest_create_cb_data(objset_t *os, uint64_t txg)
 {
 	ztest_cb_data_t *zcd = umem_zalloc(sizeof (*zcd), UMEM_NOFAIL);
 
 	zcd->zcd_spa = dmu_objset_spa(os);
 	zcd->zcd_txg = txg;
 
 	return (zcd);
 }
 
 /*
  * If a number of txgs equal to this threshold have been created after a commit
  * callback has been registered but not called, then we assume there is an
  * implementation bug.
  */
 #define	ZTEST_COMMIT_CALLBACK_THRESH	(TXG_CONCURRENT_STATES + 2)
 
 /*
  * Commit callback test.
  *
  * Registers commit callbacks on a transaction at three points (before
  * assignment, right after assignment and later on), writes the assigned
  * txg into the test object, and links the callback data into the global
  * zcl.zcl_callbacks list, which is kept ordered by txg.  About one call in
  * a hundred, or whenever dmu_tx_assign() fails, the transaction is aborted
  * instead and the callbacks are expected to have run with ECANCELED by the
  * time dmu_tx_abort() returns.
  */
 void
 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	dmu_tx_t *tx;
 	ztest_cb_data_t *cb_data[3], *tmp_cb;
 	uint64_t old_txg, txg;
 	/*
 	 * error must start out as 0: it is only assigned below when the
 	 * random abort fires, and is tested right afterwards either way.
 	 */
 	int i, error = 0;
 
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	tx = dmu_tx_create(os);
 
 	/* First callback: registered before the tx has a txg. */
 	cb_data[0] = ztest_create_cb_data(os, 0);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
 
 	dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
 
 	/* Every once in a while, abort the transaction on purpose */
 	if (ztest_random(100) == 0)
 		error = -1;
 
 	if (!error)
 		error = dmu_tx_assign(tx, TXG_NOWAIT);
 
 	txg = error ? 0 : dmu_tx_get_txg(tx);
 
 	/* Second callback: registered once the outcome is known. */
 	cb_data[0]->zcd_txg = txg;
 	cb_data[1] = ztest_create_cb_data(os, txg);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
 
 	if (error) {
 		/*
 		 * It's not a strict requirement to call the registered
 		 * callbacks from inside dmu_tx_abort(), but that's what
 		 * is supposed to happen in the current implementation
 		 * so we will check for that.
 		 */
 		for (i = 0; i < 2; i++) {
 			cb_data[i]->zcd_expected_err = ECANCELED;
 			VERIFY(!cb_data[i]->zcd_called);
 		}
 
 		dmu_tx_abort(tx);
 
 		/* The callback leaves canceled entries for us to free. */
 		for (i = 0; i < 2; i++) {
 			VERIFY(cb_data[i]->zcd_called);
 			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
 		}
 
 		return;
 	}
 
 	/* Third callback: registered on an assigned transaction. */
 	cb_data[2] = ztest_create_cb_data(os, txg);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
 
 	/*
 	 * Read existing data to make sure there isn't a future leak.
 	 */
 	VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
 	    &old_txg, DMU_READ_PREFETCH));
 
 	if (old_txg > txg)
 		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
 		    old_txg, txg);
 
 	dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
 
 	(void) mutex_lock(&zcl.zcl_callbacks_lock);
 
 	/*
 	 * Since commit callbacks don't have any ordering requirement and since
 	 * it is theoretically possible for a commit callback to be called
 	 * after an arbitrary amount of time has elapsed since its txg has been
 	 * synced, it is difficult to reliably determine whether a commit
 	 * callback hasn't been called due to high load or due to a flawed
 	 * implementation.
 	 *
 	 * In practice, we will assume that if after a certain number of txgs a
 	 * commit callback hasn't been called, then most likely there's an
 	 * implementation bug.
 	 */
 	tmp_cb = list_head(&zcl.zcl_callbacks);
 	if (tmp_cb != NULL &&
 	    (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
 		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
 		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
 	}
 
 	/*
 	 * Let's find the place to insert our callbacks.
 	 *
 	 * Even though the list is ordered by txg, it is possible for the
 	 * insertion point to not be the end because our txg may already be
 	 * quiescing at this point and other callbacks in the open txg
 	 * (from other objsets) may have sneaked in.
 	 */
 	tmp_cb = list_tail(&zcl.zcl_callbacks);
 	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
 		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
 
 	/* Add the 3 callbacks to the list */
 	for (i = 0; i < 3; i++) {
 		if (tmp_cb == NULL)
 			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
 		else
 			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
 			    cb_data[i]);
 
 		cb_data[i]->zcd_added = B_TRUE;
 		VERIFY(!cb_data[i]->zcd_called);
 
 		/* Chain the next entry right behind this one. */
 		tmp_cb = cb_data[i];
 	}
 
 	(void) mutex_unlock(&zcl.zcl_callbacks_lock);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Set each of a fixed list of dataset properties on zd's dataset to a value
  * obtained from ztest_random_dsl_prop(), holding ztest_name_lock as reader
  * for the duration.
  */
 /* ARGSUSED */
 void
 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
 	zfs_prop_t proplist[] = {
 		ZFS_PROP_CHECKSUM,
 		ZFS_PROP_COMPRESSION,
 		ZFS_PROP_COPIES,
 		ZFS_PROP_DEDUP
 	};
 	int nprops = sizeof (proplist) / sizeof (proplist[0]);
 	int p = 0;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	while (p < nprops) {
 		zfs_prop_t which = proplist[p++];
 		uint64_t setting = ztest_random_dsl_prop(which);
 		int flag = (int)ztest_random(2);
 
 		/* Failures are deliberately ignored here. */
 		(void) ztest_dsl_prop_set_uint64(zd->zd_name, which,
 		    setting, flag);
 	}
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Set the pool's ZPOOL_PROP_DEDUPDITTO property to a random value and then
  * fetch the pool properties, dumping them at verbosity 6 and above.  The
  * whole sequence runs with ztest_name_lock held as reader.
  */
 /* ARGSUSED */
 void
 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
 	nvlist_t *props = NULL;
 	uint64_t ditto;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	/* Anywhere from the minimum up to, but excluding, twice that. */
 	ditto = ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN);
 	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, ditto);
 
 	VERIFY0(spa_prop_get(ztest_spa, &props));
 
 	if (ztest_opts.zo_verbose >= 6)
 		dump_nvlist(props, 4);
 
 	nvlist_free(props);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Release the single user hold `holdname' on the snapshot `snapname'.
  * Builds the nvlist argument { snapname -> { holdname } } and returns
  * whatever dsl_dataset_user_release() returns for it.
  */
 static int
 user_release_one(const char *snapname, const char *holdname)
 {
 	nvlist_t *request = fnvlist_alloc();
 	nvlist_t *tagset = fnvlist_alloc();
 	int rc;
 
 	fnvlist_add_boolean(tagset, holdname);
 	fnvlist_add_nvlist(request, snapname, tagset);
 	fnvlist_free(tagset);
 
 	rc = dsl_dataset_user_release(request, NULL);
 	fnvlist_free(request);
 
 	return (rc);
 }
 
 /*
  * Test snapshot hold/release and deferred destroy.
  *
  * Runs two scenarios against snapshot "sh1_<id>" of zd's objset, all with
  * ztest_name_lock held as reader:
  *   1. snapshot, clone, deferred-destroy the snapshot, destroy the clone,
  *      then check the snapshot is gone;
  *   2. snapshot, place a user hold, check that a plain destroy fails with
  *      EBUSY, deferred-destroy, release the hold, then check the snapshot
  *      is gone.
  * ENOSPC from the snapshot, clone or hold step is recorded and ends the
  * test early; any other unexpected result is fatal.
  */
 void
 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
 {
 	int error;
 	objset_t *os = zd->zd_os;
 	objset_t *origin;
 	char snapname[100];
 	char fullname[100];
 	char clonename[100];
 	char tag[100];
 	char osname[MAXNAMELEN];
 	nvlist_t *holds;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	dmu_objset_name(os, osname);
 
 	/* id is cast so that it matches the %llu conversions. */
 	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
 	    (u_longlong_t)id);
 	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
 	(void) snprintf(clonename, sizeof (clonename),
 	    "%s/ch1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);
 
 	/*
 	 * Clean up from any previous run.
 	 */
 	error = dsl_destroy_head(clonename);
 	if (error != ENOENT)
 		ASSERT0(error);
 	error = user_release_one(fullname, tag);
 	if (error != ESRCH && error != ENOENT)
 		ASSERT0(error);
 	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != ENOENT)
 		ASSERT0(error);
 
 	/*
 	 * Create snapshot, clone it, mark snap for deferred destroy,
 	 * destroy clone, verify snap was also destroyed.
 	 */
 	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
 	error = dmu_objset_clone(clonename, fullname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_clone");
 			goto out;
 		}
 		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
 	}
 
 	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
 	error = dsl_destroy_head(clonename);
 	if (error)
 		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
 
 	/* The snapshot must no longer exist once its clone is gone. */
 	error = dmu_objset_hold(fullname, FTAG, &origin);
 	if (error != ENOENT)
 		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
 
 	/*
 	 * Create snapshot, add temporary hold, verify that we can't
 	 * destroy a held snapshot, mark for deferred destroy,
 	 * release hold, verify snapshot was destroyed.
 	 */
 	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
 	holds = fnvlist_alloc();
 	fnvlist_add_string(holds, fullname, tag);
 	error = dsl_dataset_user_hold(holds, 0, NULL);
 	fnvlist_free(holds);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc("dsl_dataset_user_hold");
 		goto out;
 	} else if (error) {
 		fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
 		    fullname, tag, error);
 	}
 
 	/* A held snapshot must refuse a non-deferred destroy. */
 	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != EBUSY) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
 		    fullname, error);
 	}
 
 	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
 	error = user_release_one(fullname, tag);
 	if (error)
 		fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
 
 	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
 
 out:
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Inject random faults into the on-disk data.
  *
  * Picks either a leaf of a random top-level (data or slog) vdev or a random
  * l2cache device.  Depending on how many faults the configuration can
  * tolerate (MAXFAULTS()), it may mark the first leaf unreadable or
  * unwriteable or close its file descriptor, may take it offline or online,
  * and finally overwrites random 8-byte words of the chosen device file
  * with a fixed bad pattern.
  */
 /* ARGSUSED */
 void
 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	int fd;
 	uint64_t offset;
 	uint64_t leaves;
 	uint64_t bad = 0x1990c0ffeedecadeULL;
 	uint64_t top, leaf;
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
 	int bshift = SPA_OLD_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	int maxfaults;
 	int mirror_save;
 	vdev_t *vd0 = NULL;
 	uint64_t guid0 = 0;
 	boolean_t islog = B_FALSE;
 
 	/* Snapshot the vdev layout parameters under ztest_vdev_lock. */
 	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	maxfaults = MAXFAULTS();
 	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 	mirror_save = zs->zs_mirrors;
 	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
 	ASSERT(leaves >= 1);
 
 	/*
 	 * Grab the name lock as reader. There are some operations
 	 * which don't like to have their vdevs changed while
 	 * they are in progress (i.e. spa_change_guid). Those
 	 * operations will have grabbed the name lock as writer.
 	 */
 	(void) rw_rdlock(&ztest_name_lock);
 
 	/*
 	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	if (ztest_random(2) == 0) {
 		/*
 		 * Inject errors on a normal data device or slog device.
 		 */
 		top = ztest_random_vdev_top(spa, B_TRUE);
 		leaf = ztest_random(leaves) + zs->zs_splits;
 
 		/*
 		 * Generate paths to the first leaf in this top-level vdev,
 		 * and to the random leaf we selected.  We'll induce transient
 		 * write failures and random online/offline activity on leaf 0,
 		 * and we'll write random garbage to the randomly chosen leaf.
 		 */
 		(void) snprintf(path0, sizeof (path0), ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + zs->zs_splits);
 		(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + leaf);
 
 		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
 		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
 			islog = B_TRUE;
 
 		/*
 		 * If the top-level vdev needs to be resilvered
 		 * then we only allow faults on the device that is
 		 * resilvering.
 		 */
 		if (vd0 != NULL && maxfaults != 1 &&
 		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
 		    vd0->vdev_resilver_txg != 0)) {
 			/*
 			 * Make vd0 explicitly claim to be unreadable,
 			 * or unwriteable, or reach behind its back
 			 * and close the underlying fd.  We can do this if
 			 * maxfaults == 0 because we'll fail and reexecute,
 			 * and we can do it if maxfaults >= 2 because we'll
 			 * have enough redundancy.  If maxfaults == 1, the
 			 * combination of this with injection of random data
 			 * corruption below exceeds the pool's fault tolerance.
 			 */
 			vdev_file_t *vf = vd0->vdev_tsd;
 
 			if (vf != NULL && ztest_random(3) == 0) {
 				(void) close(vf->vf_vnode->v_fd);
 				vf->vf_vnode->v_fd = -1;
 			} else if (ztest_random(2) == 0) {
 				vd0->vdev_cant_read = B_TRUE;
 			} else {
 				vd0->vdev_cant_write = B_TRUE;
 			}
 			guid0 = vd0->vdev_guid;
 		}
 	} else {
 		/*
 		 * Inject errors on an l2cache device.
 		 */
 		spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 		if (sav->sav_count == 0) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			(void) rw_unlock(&ztest_name_lock);
 			return;
 		}
 		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
 		guid0 = vd0->vdev_guid;
 		(void) strcpy(path0, vd0->vdev_path);
 		(void) strcpy(pathrand, vd0->vdev_path);
 
 		leaf = 0;
 		leaves = 1;
 		maxfaults = INT_MAX;	/* no limit on cache devices */
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 	(void) rw_unlock(&ztest_name_lock);
 
 	/*
 	 * If we can tolerate two or more faults, or we're dealing
 	 * with a slog, randomly online/offline vd0.
 	 */
 	if ((maxfaults >= 2 || islog) && guid0 != 0) {
 		if (ztest_random(10) < 6) {
 			int flags = (ztest_random(2) == 0 ?
 			    ZFS_OFFLINE_TEMPORARY : 0);
 
 			/*
 			 * We have to grab the ztest_name_lock as writer to
 			 * prevent a race between offlining a slog and
 			 * destroying a dataset. Offlining the slog will
 			 * grab a reference on the dataset which may cause
 			 * dmu_objset_destroy() to fail with EBUSY thus
 			 * leaving the dataset in an inconsistent state.
 			 */
 			if (islog)
 				(void) rw_wrlock(&ztest_name_lock);
 
 			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
 
 			if (islog)
 				(void) rw_unlock(&ztest_name_lock);
 		} else {
 			/*
 			 * Ideally we would like to be able to randomly
 			 * call vdev_[on|off]line without holding locks
 			 * to force unpredictable failures but the side
 			 * effects of vdev_[on|off]line prevent us from
 			 * doing so. We grab the ztest_vdev_lock here to
 			 * prevent a race between injection testing and
 			 * aux_vdev removal.
 			 */
 			VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 			(void) vdev_online(spa, guid0, 0, NULL);
 			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		}
 	}
 
 	if (maxfaults == 0)
 		return;
 
 	/*
 	 * We have at least single-fault tolerance, so inject data corruption.
 	 */
 	fd = open(pathrand, O_RDWR);
 
 	if (fd == -1)	/* we hit a gap in the device namespace */
 		return;
 
 	fsize = lseek(fd, 0, SEEK_END);
 
 	while (--iters != 0) {
 		/*
 		 * Pick a random 8-byte-aligned offset inside the part of a
 		 * random (leaves << bshift)-sized stripe that belongs to
 		 * this leaf.
 		 */
 		offset = ztest_random(fsize / (leaves << bshift)) *
 		    (leaves << bshift) + (leaf << bshift) +
 		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
 
 		if (offset >= fsize)
 			continue;
 
 		/*
 		 * Stop injecting as soon as the mirror count differs from
 		 * the value sampled at the start of this function.
 		 */
 		VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 		if (mirror_save != zs->zs_mirrors) {
 			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 			(void) close(fd);
 			return;
 		}
 
 		/* offset is cast to match %llx, as in the printf() below. */
 		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
 			fatal(1, "can't inject bad word at 0x%llx in %s",
 			    (u_longlong_t)offset, pathrand);
 
 		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
 		if (ztest_opts.zo_verbose >= 7)
 			(void) printf("injected bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 	}
 
 	(void) close(fd);
 }
 
 /*
  * Verify that DDT repair works as expected.
  *
  * Writes 2 * ZIO_DEDUPDITTO_MIN identical blocks to one object with dedup
  * switched on, waits for them to sync, and then deliberately overwrites
  * the first block on disk with the inverted pattern.
  */
 void
 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *shared = ztest_shared;
 	spa_t *pool = ztest_spa;
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	uint64_t object, blocksize, txg, pattern, damaged_size;
 	enum zio_checksum dedup_cksum = spa_dedup_checksum(pool);
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	void *garbage;
 	blkptr_t first_bp;
 	int ncopies = 2 * ZIO_DEDUPDITTO_MIN;
 
 	/* Keep the blocks small, because we write so many of them. */
 	blocksize = ztest_random_blocksize();
 	blocksize = MIN(blocksize, 2048);
 
 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
 
 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 		return;
 
 	/*
 	 * Hold the name lock as writer so that nobody else can change the
 	 * pool and dataset properties this test relies on while it runs.
 	 */
 	(void) rw_wrlock(&ztest_name_lock);
 
 	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP,
 	    dedup_cksum, B_FALSE) != 0)
 		goto out;
 
 	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
 	    B_FALSE) != 0)
 		goto out;
 
 	object = od[0].od_object;
 	blocksize = od[0].od_blocksize;
 	pattern = shared->zs_guid ^ dmu_objset_fsid_guid(os);
 
 	ASSERT(object != 0);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, object, 0, ncopies * blocksize);
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0)
 		goto out;
 
 	/*
 	 * Fill every copy of the block with the same pattern.
 	 */
 	for (int c = 0; c < ncopies; c++) {
 		uint64_t offset = c * blocksize;
 		int error = dmu_buf_hold(os, object, offset, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
 
 		if (error != 0) {
 			fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
 			    os, (long long)object, (long long) offset, error);
 		}
 		ASSERT(db->db_offset == offset);
 		ASSERT(db->db_size == blocksize);
 		/* The block is either still zero or holds our pattern. */
 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
 		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
 		dmu_buf_will_fill(db, tx);
 		ztest_pattern_set(db->db_data, db->db_size, pattern);
 		dmu_buf_rele(db, FTAG);
 	}
 
 	dmu_tx_commit(tx);
 	txg_wait_synced(spa_get_dsl(pool), txg);
 
 	/*
 	 * Fetch a copy of the block pointer that the first block got.
 	 */
 	VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
 	    DMU_READ_NO_PREFETCH));
 	first_bp = *((dmu_buf_impl_t *)db)->db_blkptr;
 	dmu_buf_rele(db, FTAG);
 
 	/*
 	 * Now corrupt that block on disk; dedup-ditto is expected to rescue
 	 * us when the data is read back later.
 	 */
 	damaged_size = BP_GET_PSIZE(&first_bp);
 	garbage = zio_buf_alloc(damaged_size);
 	ztest_pattern_set(garbage, damaged_size, ~pattern);
 
 	(void) zio_wait(zio_rewrite(NULL, pool, 0, &first_bp,
 	    garbage, damaged_size, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
 
 	zio_buf_free(garbage, damaged_size);
 
 out:
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Scrub the pool: request a scrub, pause for 100ms, then request it again
  * to force a restart.  The results of both requests are ignored.
  */
 /* ARGSUSED */
 void
 ztest_scrub(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *pool = ztest_spa;
 	int pass;
 
 	for (pass = 0; pass < 2; pass++) {
 		/* wait a moment before the second, restarting request */
 		if (pass == 1)
 			(void) poll(NULL, 0, 100);
 		(void) spa_scan(pool, POOL_SCAN_SCRUB);
 	}
 }
 
 /*
  * Change the guid for the pool, and verify afterwards that the pool guid
  * differs from the old one while the load guid is unchanged.  Nothing is
  * verified when spa_change_guid() reports an error.
  */
 /* ARGSUSED */
 void
 ztest_reguid(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa = ztest_spa;
 	uint64_t orig = spa_guid(spa);
 	uint64_t load = spa_load_guid(spa);
 	int rc;
 
 	/* The guid change itself runs with the name lock held as writer. */
 	(void) rw_wrlock(&ztest_name_lock);
 	rc = spa_change_guid(spa);
 	(void) rw_unlock(&ztest_name_lock);
 
 	if (rc != 0)
 		return;
 
 	if (ztest_opts.zo_verbose >= 4)
 		(void) printf("Changed guid old %llu -> %llu\n",
 		    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
 
 	VERIFY3U(orig, !=, spa_guid(spa));
 	VERIFY3U(load, ==, spa_load_guid(spa));
 }
 
 /*
  * Rename the pool to "<name>_tmp" and then rename it back, checking after
  * each step that the pool opens only under its current name and is still
  * the same spa_t.  The name lock is held as writer throughout.
  */
 /* ARGSUSED */
 void
 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
 {
 	char *oldname, *newname;
 	size_t newsize;
 	spa_t *spa;
 
 	(void) rw_wrlock(&ztest_name_lock);
 
 	/* Room for the old name, the "_tmp" suffix and the terminator. */
 	oldname = ztest_opts.zo_pool;
 	newsize = strlen(oldname) + 5;
 	newname = umem_alloc(newsize, UMEM_NOFAIL);
 	(void) snprintf(newname, newsize, "%s_tmp", oldname);
 
 	/* Step 1: old name -> temporary name. */
 	VERIFY3U(0, ==, spa_rename(oldname, newname));
 
 	/* The old name must have vanished ... */
 	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/* ... while the new name must lead to the very same spa_t. */
 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 
 	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	/* Step 2: temporary name -> old name. */
 	VERIFY3U(0, ==, spa_rename(newname, oldname));
 
 	/* The pool has to be reachable under its original name again. */
 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
 	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	umem_free(newname, newsize);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Verify pool integrity by running zdb.
  *
  * Builds a "zdb -bcc... -d -U <config path> <pool>" command line, runs it
  * through popen(), echoes its output at verbosity 3 and above, and calls
  * fatal() if zdb exits with a non-zero status or dies from a signal.
  * fatal() is also called if the command line does not fit into the local
  * buffer.
  */
 static void
 ztest_run_zdb(char *pool)
 {
 	int status;
 	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
 	char zbuf[1024];
 	char *bin;
 	char *ztest;
 	char *isa;
 	int isalen;
 	int len;
 	size_t room;
 	FILE *fp;
 
 	strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb));
 
 	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
 	bin = strstr(zdb, "/usr/bin/");
 	ztest = strstr(bin, "/ztest");
 	isa = bin + 8;
 	isalen = ztest - isa;
 	/* Copy it, because the buffer it points into is overwritten below. */
 	isa = strdup(isa);
 
 	/*
 	 * The config path and the pool name are not under our control, so
 	 * bound the write to what is left of zdb[] and refuse to run a
 	 * truncated command.
 	 */
 	room = sizeof (zdb) - (bin - zdb);
 	len = snprintf(bin, room,
 	    "/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s",
 	    isalen,
 	    isa,
 	    ztest_opts.zo_verbose >= 3 ? "s" : "",
 	    ztest_opts.zo_verbose >= 4 ? "v" : "",
 	    spa_config_path,
 	    pool);
 	free(isa);
 
 	if (len < 0 || (size_t)len >= room)
 		fatal(0, "zdb command line too long for pool '%s'", pool);
 
 	if (ztest_opts.zo_verbose >= 5)
 		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
 
 	fp = popen(zdb, "r");
 	assert(fp != NULL);
 
 	/* Drain zdb's output; echo it only when verbose enough. */
 	while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
 		if (ztest_opts.zo_verbose >= 3)
 			(void) printf("%s", zbuf);
 
 	status = pclose(fp);
 
 	if (status == 0)
 		return;
 
 	ztest_dump_core = 0;
 	if (WIFEXITED(status))
 		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
 	else
 		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
 }
 
 /*
  * Walk every pool returned by spa_next() while holding
  * spa_namespace_lock.  At verbosity 6 and above, print `header' followed
  * by one indented line per pool name.
  */
 static void
 ztest_walk_pool_directory(char *header)
 {
 	spa_t *cursor;
 
 	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s\n", header);
 
 	mutex_enter(&spa_namespace_lock);
 	for (cursor = spa_next(NULL); cursor != NULL;
 	    cursor = spa_next(cursor)) {
 		if (ztest_opts.zo_verbose >= 6)
 			(void) printf("\t%s\n", spa_name(cursor));
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
 	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
 	int error;
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
 		    oldname, newname);
 	}
 
 	/*
 	 * Clean up from previous runs.
 	 */
 	(void) spa_destroy(newname);
 
 	/*
 	 * Get the pool's configuration and guid.
 	 */
 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Kick off a scrub to tickle scrub/export races.
 	 */
 	if (ztest_random(2) == 0)
 		(void) spa_scan(spa, POOL_SCAN_SCRUB);
 
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
 	ztest_walk_pool_directory("pools before export");
 
 	/*
 	 * Export it.
 	 */
 	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
 
 	ztest_walk_pool_directory("pools after export");
 
 	/*
 	 * Try to import it.
 	 */
 	newconfig = spa_tryimport(config);
 	ASSERT(newconfig != NULL);
 	nvlist_free(newconfig);
 
 	/*
 	 * Import it under the new name.
 	 */
 	error = spa_import(newname, config, NULL, 0);
 	if (error != 0) {
 		dump_nvlist(config, 0);
 		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
 		    oldname, newname, error);
 	}
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
 	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 */
 	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
 	 */
 	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Verify that we can open and close the pool using the new name.
 	 */
 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 	ASSERT(pool_guid == spa_guid(spa));
 	spa_close(spa, FTAG);
 
 	nvlist_free(config);
 }
 
 /*
  * Clear vdev errors on the whole pool and resume any suspended I/O.
  * If the pool is currently suspended and zo_verbose is at least 6,
  * a one-line notice is printed first.
  */
 static void
 ztest_resume(spa_t *spa)
 {
 	if (spa_suspended(spa)) {
 		if (ztest_opts.zo_verbose >= 6)
 			(void) printf("resuming from suspended state\n");
 	}
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 	vdev_clear(spa, NULL);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	(void) zio_resume(spa);
 }
 
 /*
  * Thread body: until ztest_exiting is set, check the pool passed in
  * 'arg' and call ztest_resume() whenever it is suspended, pausing
  * 100ms between checks.  Always returns NULL.
  */
 static void *
 ztest_resume_thread(void *arg)
 {
 	spa_t *pool = arg;
 
 	for (;;) {
 		if (ztest_exiting)
 			break;
 
 		if (spa_suspended(pool))
 			ztest_resume(pool);
 
 		/* poll() with no descriptors is used purely as a delay. */
 		(void) poll(NULL, 0, 100);
 	}
 
 	return (NULL);
 }
 
 static void *
 ztest_deadman_thread(void *arg)
 {
 	ztest_shared_t *zs = arg;
 	spa_t *spa = ztest_spa;
 	hrtime_t delta, total = 0;
 
 	for (;;) {
 		delta = zs->zs_thread_stop - zs->zs_thread_start +
 		    MSEC2NSEC(zfs_deadman_synctime_ms);
 
 		(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
 
 		/*
 		 * If the pool is suspended then fail immediately. Otherwise,
 		 * check to see if the pool is making any progress. If
 		 * vdev_deadman() discovers that there hasn't been any recent
 		 * I/Os then it will end up aborting the tests.
 		 */
 		if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
 			fatal(0, "aborting test after %llu seconds because "
 			    "pool has transitioned to a suspended state.",
 			    zfs_deadman_synctime_ms / 1000);
 			return (NULL);
 		}
 		vdev_deadman(spa->spa_root_vdev);
 
 		total += zfs_deadman_synctime_ms/1000;
 		(void) printf("ztest has been running for %lld seconds\n",
 		    total);
 	}
 }
 
 /*
  * Run test function 'zi' (index 'test' in the shared call-state table)
  * zi_iters times against the dataset selected by 'id', then add one
  * call and the elapsed time to that function's shared counters.  At
  * zo_verbose >= 4 the elapsed time and function name are printed.
  */
 static void
 ztest_execute(int test, ztest_info_t *zi, uint64_t id)
 {
 	ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
 	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
 	hrtime_t functime = gethrtime();
 
 	for (int i = 0; i < zi->zi_iters; i++)
 		zi->zi_func(zd, id);
 
 	functime = gethrtime() - functime;
 
 	atomic_add_64(&zc->zc_count, 1);
 	atomic_add_64(&zc->zc_time, functime);
 
 	if (ztest_opts.zo_verbose >= 4) {
 		Dl_info dli;
 		const char *fname = "<unknown>";
 
 		/*
 		 * dladdr() returns 0 on failure and can leave dli_sname
 		 * NULL when no symbol matches the address; never pass
 		 * either case to a %s conversion.
 		 */
 		if (dladdr((void *)zi->zi_func, &dli) != 0 &&
 		    dli.dli_sname != NULL)
 			fname = dli.dli_sname;
 
 		(void) printf("%6.2f sec in %s\n",
 		    (double)functime / NANOSEC, fname);
 	}
 }
 
 /*
  * Worker thread body.  'arg' carries the thread id.  Until the shared
  * zs_thread_stop time is reached, repeatedly pick a random entry of
  * ztest_info[] and, if that entry is due and this thread wins the
  * compare-and-swap on its zc_next field, run it via ztest_execute().
  * Always returns NULL.
  */
 static void *
 ztest_thread(void *arg)
 {
 	uint64_t id = (uintptr_t)arg;
 	ztest_shared_t *zs = ztest_shared;
 
 	for (;;) {
 		hrtime_t now = gethrtime();
 		ztest_shared_callstate_t *zc;
 		ztest_info_t *zi;
 		uint64_t due, rescheduled;
 		int pick;
 
 		if (now >= zs->zs_thread_stop)
 			break;
 
 		/*
 		 * Past the kill time: force a crash.
 		 */
 		if (now > zs->zs_thread_kill)
 			ztest_kill(zs);
 
 		/*
 		 * Give up once ENOSPC has been seen more than 10 times.
 		 */
 		if (zs->zs_enospc_count > 10)
 			break;
 
 		/*
 		 * Choose a function at random and look up its schedule.
 		 */
 		pick = ztest_random(ZTEST_FUNCS);
 		zi = &ztest_info[pick];
 		zc = ZTEST_GET_SHARED_CALLSTATE(pick);
 		due = zc->zc_next;
 
 		/* Not due yet: try another function. */
 		if (now < due)
 			continue;
 
 		/*
 		 * Only the thread whose compare-and-swap still sees the
 		 * value read above gets to run the function.
 		 */
 		rescheduled = due + ztest_random(2 * zi->zi_interval[0] + 1);
 		if (atomic_cas_64(&zc->zc_next, due, rescheduled) == due)
 			ztest_execute(pick, zi, id);
 	}
 
 	return (NULL);
 }
 
 /*
  * Format the name of dataset number 'd' in 'pool' as "<pool>/ds_<d>"
  * into 'dsname', writing at most MAXNAMELEN bytes.
  */
 static void
 ztest_dataset_name(char *dsname, char *pool, int d)
 {
 	const size_t cap = MAXNAMELEN;
 
 	(void) snprintf(dsname, cap, "%s/ds_%d", pool, d);
 }
 
 /*
  * Destroy dataset number 'd' of the test pool, together with its
  * snapshots and children, after first cleaning up the per-thread
  * clones and snapshots that hang off it.
  */
 static void
 ztest_dataset_destroy(int d)
 {
 	char dsname[MAXNAMELEN];
 	int t = d;
 
 	ztest_dataset_name(dsname, ztest_opts.zo_pool, d);
 
 	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("Destroying %s to free up space\n", dsname);
 
 	/*
 	 * Thread t works on dataset (t % zo_datasets), so every thread
 	 * id congruent to d may have left non-standard clones and
 	 * snapshots behind; clean up after each of them.
 	 */
 	while (t < ztest_opts.zo_threads) {
 		ztest_dsl_dataset_cleanup(dsname, t);
 		t += ztest_opts.zo_datasets;
 	}
 
 	(void) dmu_objset_find(dsname, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 }
 
 /*
  * Check dataset 'zd' for leaked objects by comparing the entry count of
  * ZTEST_DIROBJ with the dataset's used-object count.
  */
 static void
 ztest_dataset_dirobj_verify(ztest_ds_t *zd)
 {
 	/* 'scratch' receives the dmu_objset_space() outputs we ignore. */
 	uint64_t usedobjs, dirobjs, scratch;
 
 	/*
 	 * ZTEST_DIROBJ is the object directory for the entire dataset.
 	 * Therefore, the number of objects in use should equal the
 	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
 	 * If not, we have an object leak.
 	 *
 	 * Note that we can only check this in ztest_dataset_open(),
 	 * when the open-context and syncing-context values agree.
 	 * That's because zap_count() returns the open-context value,
 	 * while dmu_objset_space() returns the rootbp fill count.
 	 */
 	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
 	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
 	ASSERT3U(dirobjs + 1, ==, usedobjs);
 }
 
 /*
  * Create (if necessary) and take ownership of dataset number 'd',
  * initialize its ztest_ds_t, replay its intent log and open its ZIL.
  *
  * Returns 0 on success, or ENOSPC if the dataset could not be created
  * for lack of space (the event is recorded via ztest_record_enospc()).
  * Calls fatal() if the log header's claimed sequence number, or the
  * sequence number reached by replay, is non-zero but lower than the
  * sequence number recorded as committed in the shared dataset state.
  */
 static int
 ztest_dataset_open(int d)
 {
 	ztest_ds_t *zd = &ztest_ds[d];
 	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
 	objset_t *os;
 	zilog_t *zilog;
 	char name[MAXNAMELEN];
 	int error;
 
 	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	error = ztest_dataset_create(name);
 	if (error == ENOSPC) {
 		(void) rw_unlock(&ztest_name_lock);
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
 	ASSERT(error == 0 || error == EEXIST);
 
 	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
 	(void) rw_unlock(&ztest_name_lock);
 
 	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
 
 	zilog = zd->zd_zilog;
 
 	/*
 	 * The sequence numbers are cast to u_longlong_t so that they
 	 * match the %llu conversions, as the printf() below already does.
 	 */
 	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
 	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
 		fatal(0, "missing log records: claimed %llu < committed %llu",
 		    (u_longlong_t)zilog->zl_header->zh_claim_lr_seq,
 		    (u_longlong_t)committed_seq);
 
 	/* Check for object leaks both before and after log replay. */
 	ztest_dataset_dirobj_verify(zd);
 
 	zil_replay(os, zd, ztest_replay_vector);
 
 	ztest_dataset_dirobj_verify(zd);
 
 	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
 		    zd->zd_name,
 		    (u_longlong_t)zilog->zl_parse_blk_count,
 		    (u_longlong_t)zilog->zl_parse_lr_count,
 		    (u_longlong_t)zilog->zl_replaying_seq);
 
 	zilog = zil_open(os, ztest_get_data);
 
 	if (zilog->zl_replaying_seq != 0 &&
 	    zilog->zl_replaying_seq < committed_seq)
 		fatal(0, "missing log records: replayed %llu < committed %llu",
 		    (u_longlong_t)zilog->zl_replaying_seq,
 		    (u_longlong_t)committed_seq);
 
 	return (0);
 }
 
 /*
  * Undo ztest_dataset_open() for dataset number 'd': close its ZIL,
  * give up ownership of the objset, and tear down the ztest_ds_t.
  */
 static void
 ztest_dataset_close(int d)
 {
 	ztest_ds_t *ds = ztest_ds + d;
 
 	zil_close(ds->zd_zilog);
 	dmu_objset_disown(ds->zd_os, ds);
 	ztest_zd_fini(ds);
 }
 
 /*
  * Kick off threads to run tests on all datasets in parallel.
  *
  * One pass of the test: set up the pass timing in 'zs', open the pool,
  * start the resume and deadman helper threads, start zo_threads worker
  * threads (opening one dataset for each of the first zo_datasets of
  * them), wait for the workers, record the pool's allocation figures in
  * 'zs', and finally close the pool and optionally exercise
  * export/import.
  */
 static void
 ztest_run(ztest_shared_t *zs)
 {
 	thread_t *tid;
 	spa_t *spa;
 	objset_t *os;
 	thread_t resume_tid;
 	int error;
 
 	ztest_exiting = B_FALSE;
 
 	/*
 	 * Initialize parent/child shared state.
 	 */
 	VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
 	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	/*
 	 * The pass ends after zo_passtime seconds or at zs_proc_stop,
 	 * whichever comes first.  The kill time defaults to the stop
 	 * time; when ztest_random(100) falls below zo_killrate it is
 	 * moved earlier by a random amount.
 	 */
 	zs->zs_thread_start = gethrtime();
 	zs->zs_thread_stop =
 	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
 	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
 	zs->zs_thread_kill = zs->zs_thread_stop;
 	if (ztest_random(100) < ztest_opts.zo_killrate) {
 		zs->zs_thread_kill -=
 		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
 	}
 
 	(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
 
 	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
 	    offsetof(ztest_cb_data_t, zcd_node));
 
 	/*
 	 * Open our pool.
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	spa->spa_debug = B_TRUE;
 	metaslab_preload_limit = ztest_random(20) + 1;
 	ztest_spa = spa;
 
 	/* Briefly own the pool's root objset just to record its fsid guid. */
 	VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
 	    DMU_OST_ANY, B_TRUE, FTAG, &os));
 	zs->zs_guid = dmu_objset_fsid_guid(os);
 	dmu_objset_disown(os, FTAG);
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
 	/*
 	 * We don't expect the pool to suspend unless maxfaults == 0,
 	 * in which case ztest_fault_inject() temporarily takes away
 	 * the only valid replica.
 	 */
 	if (MAXFAULTS() == 0)
 		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
 	else
 		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
 
 	/*
 	 * Create a thread to periodically resume suspended I/O.
 	 */
 	VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
 	    &resume_tid) == 0);
 
 	/*
 	 * Create a deadman thread to abort() if we hang.
 	 */
 	VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
 	    NULL) == 0);
 
 	/*
 	 * Verify that we can safely inquire about any object,
 	 * whether it's allocated or not.  To make it interesting,
 	 * we probe a 5-wide window around each power of two.
 	 * This hits all edge cases, including zero and the max.
 	 */
 	for (int t = 0; t < 64; t++) {
 		for (int d = -5; d <= 5; d++) {
 			error = dmu_object_info(spa->spa_meta_objset,
 			    (1ULL << t) + d, NULL);
 			ASSERT(error == 0 || error == ENOENT ||
 			    error == EINVAL);
 		}
 	}
 
 	/*
 	 * If we got any ENOSPC errors on the previous run, destroy something.
 	 */
 	if (zs->zs_enospc_count != 0) {
 		int d = ztest_random(ztest_opts.zo_datasets);
 		ztest_dataset_destroy(d);
 	}
 	zs->zs_enospc_count = 0;
 
 	tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
 	    UMEM_NOFAIL);
 
 	if (ztest_opts.zo_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
 	/*
 	 * Kick off all the tests that run in parallel.
 	 *
 	 * NOTE(review): if ztest_dataset_open() fails (it only returns
 	 * non-zero for ENOSPC) we return immediately, skipping the joins,
 	 * the umem_free() of 'tid', spa_close() and kernel_fini() below.
 	 */
 	for (int t = 0; t < ztest_opts.zo_threads; t++) {
 		if (t < ztest_opts.zo_datasets &&
 		    ztest_dataset_open(t) != 0)
 			return;
 		VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
 		    THR_BOUND, &tid[t]) == 0);
 	}
 
 	/*
 	 * Wait for all of the tests to complete.  We go in reverse order
 	 * so we don't close datasets while threads are still using them.
 	 */
 	for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
 		VERIFY(thr_join(tid[t], NULL, NULL) == 0);
 		if (t < ztest_opts.zo_datasets)
 			ztest_dataset_close(t);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/* Publish the pool's allocated and total space through 'zs'. */
 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
 	zfs_dbgmsg_print(FTAG);
 
 	umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
 
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
 	VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
 	ztest_resume(spa);
 
 	/*
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
 	for (uint64_t object = 1; object < 50; object++) {
 		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	spa_close(spa, FTAG);
 
 	/*
 	 * Verify that we can loop over all pools.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		if (ztest_opts.zo_verbose > 3)
 			(void) printf("spa_next: found %s\n", spa_name(spa));
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Verify that we can export the pool and reimport it under a
 	 * different name.
 	 */
 	if (ztest_random(2) == 0) {
 		char name[MAXNAMELEN];
 		(void) snprintf(name, MAXNAMELEN, "%s_import",
 		    ztest_opts.zo_pool);
 		/* Rename to "<pool>_import" and then back again. */
 		ztest_spa_import_export(ztest_opts.zo_pool, name);
 		ztest_spa_import_export(name, ztest_opts.zo_pool);
 	}
 
 	kernel_fini();
 
 	list_destroy(&zcl.zcl_callbacks);
 
 	(void) _mutex_destroy(&zcl.zcl_callbacks_lock);
 
 	(void) rwlock_destroy(&ztest_name_lock);
 	(void) _mutex_destroy(&ztest_vdev_lock);
 }
 
 /*
  * Exercise spa_freeze(): open the pool and dataset 0, make sure the
  * dataset's log chain has a first block, freeze the pool, generate
  * some logged I/O, then close everything and reopen the pool and
  * dataset so that the intent log is replayed.
  */
 static void
 ztest_freeze(void)
 {
 	ztest_ds_t *zd = &ztest_ds[0];
 	spa_t *spa;
 	int numloops = 0;
 
 	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("testing spa_freeze()...\n");
 
 	kernel_init(FREAD | FWRITE);
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	VERIFY3U(0, ==, ztest_dataset_open(0));
 	spa->spa_debug = B_TRUE;
 	ztest_spa = spa;
 
 	/*
 	 * Force the first log block to be transactionally allocated.
 	 * We have to do this before we freeze the pool -- otherwise
 	 * the log chain won't be anchored.
 	 */
 	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
 		ztest_dmu_object_alloc_free(zd, 0);
 		zil_commit(zd->zd_zilog, 0);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Freeze the pool.  This stops spa_sync() from doing anything,
 	 * so that the only way to record changes from now on is the ZIL.
 	 */
 	spa_freeze(spa);
 
 	/*
 	 * Because it is hard to predict how much space a write will actually
 	 * require beforehand, we leave ourselves some fudge space to write over
 	 * capacity.
 	 */
 	uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
 
 	/*
 	 * Run tests that generate log records but don't alter the pool config
 	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
 	 * We do a txg_wait_synced() after each iteration to force the txg
 	 * to increase well beyond the last synced value in the uberblock.
 	 * The ZIL should be OK with that.
 	 *
 	 * Run a random number of times less than zo_maxloops and ensure we do
 	 * not run out of space on the pool.
 	 */
 	while (ztest_random(10) != 0 &&
 	    numloops++ < ztest_opts.zo_maxloops &&
 	    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
 		ztest_od_t od;
 		ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
 		VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
 		ztest_io(zd, od.od_object,
 		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 		txg_wait_synced(spa_get_dsl(spa), 0);
 	}
 
 	/*
 	 * Commit all of the changes we just generated.
 	 */
 	zil_commit(zd->zd_zilog, 0);
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Close our dataset and close the pool.
 	 */
 	ztest_dataset_close(0);
 	spa_close(spa, FTAG);
 	kernel_fini();
 
 	/*
 	 * Open and close the pool and dataset to induce log replay.
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	/* After the reopen the pool must no longer report a freeze txg. */
 	ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
 	VERIFY3U(0, ==, ztest_dataset_open(0));
 	ztest_dataset_close(0);
 
 	spa->spa_debug = B_TRUE;
 	ztest_spa = spa;
 	txg_wait_synced(spa_get_dsl(spa), 0);
 	ztest_reguid(NULL, 0);
 
 	spa_close(spa, FTAG);
 	kernel_fini();
 }
 
 /*
  * Render the interval 't' (in nanoseconds) into 'timebuf' as days,
  * hours, minutes and seconds, leaving out leading units that are zero,
  * e.g. "1h02m03s" or "45s".  The caller supplies a buffer large enough
  * to hold the result.
  */
 void
 print_time(hrtime_t t, char *timebuf)
 {
 	hrtime_t total_sec = t / NANOSEC;
 	hrtime_t total_min = total_sec / 60;
 	hrtime_t total_hr = total_min / 60;
 	hrtime_t days = total_hr / 24;
 	hrtime_t secs = total_sec % 60;
 	hrtime_t mins = total_min % 60;
 	hrtime_t hrs = total_hr % 24;
 
 	timebuf[0] = '\0';
 
 	if (days != 0) {
 		(void) sprintf(timebuf,
 		    "%llud%02lluh%02llum%02llus", days, hrs, mins, secs);
 	} else if (hrs != 0) {
 		(void) sprintf(timebuf, "%lluh%02llum%02llus",
 		    hrs, mins, secs);
 	} else if (mins != 0) {
 		(void) sprintf(timebuf, "%llum%02llus", mins, secs);
 	} else {
 		(void) sprintf(timebuf, "%llus", secs);
 	}
 }
 
 /*
  * Allocate and return a property nvlist that is either empty or holds
  * a single "autoreplace" = 1 entry, depending on ztest_random(2).
  * The caller frees the list.
  */
 static nvlist_t *
 make_random_props()
 {
 	nvlist_t *props;
 
 	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
 
 	/* A zero draw leaves the list empty. */
 	if (ztest_random(2) != 0) {
 		VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
 	}
 
 	return (props);
 }
 
 /*
  * Create a storage pool with the given name and initial vdev size.
  * Then test spa_freeze() functionality.
  *
  * Any existing pool named zo_pool is destroyed first.  The metaslab
  * size of the new pool's first top-level vdev is recorded in 'zs', and
  * zdb is run against the pool both before and after ztest_freeze().
  */
 static void
 ztest_init(ztest_shared_t *zs)
 {
 	spa_t *spa;
 	nvlist_t *nvroot, *props;
 
 	VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
 	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Create the storage pool.
 	 */
 	(void) spa_destroy(ztest_opts.zo_pool);
 	ztest_shared->zs_vdev_next_leaf = 0;
 	zs->zs_splits = 0;
 	zs->zs_mirrors = ztest_opts.zo_mirrors;
 	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
 	    0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
 	props = make_random_props();
 	/*
 	 * Add a "feature@<name>" property with value 0 for every entry
 	 * in spa_feature_table.
 	 */
 	for (int i = 0; i < SPA_FEATURES; i++) {
 		char buf[1024];
 		(void) snprintf(buf, sizeof (buf), "feature@%s",
 		    spa_feature_table[i].fi_uname);
 		VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
 	}
 	VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
 	nvlist_free(nvroot);
 	nvlist_free(props);
 
 	/* Reopen the new pool just long enough to read its metaslab shift. */
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	zs->zs_metaslab_sz =
 	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
 
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 
 	ztest_run_zdb(ztest_opts.zo_pool);
 
 	ztest_freeze();
 
 	ztest_run_zdb(ztest_opts.zo_pool);
 
 	(void) rwlock_destroy(&ztest_name_lock);
 	(void) _mutex_destroy(&ztest_vdev_lock);
 }
 
 /*
  * Create the temporary file that backs the data shared between the
  * parent and child ztest processes, store its descriptor in
  * ztest_fd_data, and unlink it right away so that only the open
  * descriptor keeps it alive.
  */
 static void
 setup_data_fd(void)
 {
 	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
 
 	ztest_fd_data = mkstemp(ztest_name_data);
 	/*
 	 * mkstemp() can fail at run time, so check it with VERIFY
 	 * rather than with a debug-only ASSERT.
 	 */
 	VERIFY3S(ztest_fd_data, >=, 0);
 	(void) unlink(ztest_name_data);
 }
 
 
 /*
  * Return the total number of bytes described by 'hdr': the header
  * itself, the options block, the shared state, plus the per-function
  * stats array and the per-dataset array.
  */
 static int
 shared_data_size(ztest_shared_hdr_t *hdr)
 {
 	int total = hdr->zh_hdr_size;
 
 	/* Fixed-size sections. */
 	total += hdr->zh_opts_size;
 	total += hdr->zh_size;
 
 	/* Arrays: element size times element count. */
 	total += hdr->zh_stats_size * hdr->zh_stats_count;
 	total += hdr->zh_ds_size * hdr->zh_ds_count;
 
 	return (total);
 }
 
 /*
  * Write the shared-data header into the file behind ztest_fd_data and
  * grow the file to the full size that header describes.  The header
  * records the size of each section and the number of per-function and
  * per-dataset entries.
  */
 static void
 setup_hdr(void)
 {
 	int size;
 	ztest_shared_hdr_t *hdr;
 
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	/*
 	 * mmap() can fail at run time; check it with VERIFY, like the
 	 * ftruncate() calls below, rather than with a debug-only ASSERT.
 	 */
 	VERIFY(hdr != MAP_FAILED);
 
 	/* Make the file big enough for the header before writing it. */
 	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
 
 	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
 	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
 	hdr->zh_size = sizeof (ztest_shared_t);
 	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
 	hdr->zh_stats_count = ZTEST_FUNCS;
 	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
 	hdr->zh_ds_count = ztest_opts.zo_datasets;
 
 	/* Now grow the file to hold every section the header describes. */
 	size = shared_data_size(hdr);
 	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 }
 
 /*
  * Map the shared-data file behind ztest_fd_data and point the globals
  * ztest_shared_hdr, ztest_shared_opts, ztest_shared,
  * ztest_shared_callstate and ztest_shared_ds at their sections.
  * The header is mapped read-only first to learn the total size, then
  * the whole file is mapped read/write.
  */
 static void
 setup_data(void)
 {
 	int size, offset;
 	ztest_shared_hdr_t *hdr;
 	uint8_t *buf;
 
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
 	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
 	/*
 	 * The header is dereferenced right below, so a failed mmap()
 	 * must be caught with VERIFY, not with a debug-only ASSERT.
 	 */
 	VERIFY(hdr != MAP_FAILED);
 
 	size = shared_data_size(hdr);
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	VERIFY(hdr != MAP_FAILED);
 	buf = (uint8_t *)hdr;
 
 	/* The sections follow the header back to back, in this order. */
 	offset = hdr->zh_hdr_size;
 	ztest_shared_opts = (void *)&buf[offset];
 	offset += hdr->zh_opts_size;
 	ztest_shared = (void *)&buf[offset];
 	offset += hdr->zh_size;
 	ztest_shared_callstate = (void *)&buf[offset];
 	offset += hdr->zh_stats_size * hdr->zh_stats_count;
 	ztest_shared_ds = (void *)&buf[offset];
 }
 
 /*
  * Fork and exec 'cmd' (or, when 'cmd' is NULL, this program's own
  * getexecname() path) as a child, handing it the shared-data file
  * descriptor through the ZTEST_FD_DATA environment variable.  When
  * 'libpath' is non-NULL the child also gets it as LD_LIBRARY_PATH.
  * The parent waits for the child and, if 'statusp' is non-NULL,
  * stores the raw wait status there.
  *
  * Returns B_FALSE if the child exited with status 0, and B_TRUE if it
  * was killed by SIGKILL while 'ignorekill' is set.  Every other
  * outcome terminates this process: exit code 2 for a nonzero child
  * exit status, 3 for any other terminating signal, 4 otherwise.
  */
 static boolean_t
 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
 {
 	char *selfpath = NULL;
 	pid_t child, reaped;
 	int status;
 
 	child = fork();
 
 	/* Runs before the fork result is examined, as it always has. */
 	if (cmd == NULL) {
 		selfpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 		(void) strlcpy(selfpath, getexecname(), MAXPATHLEN);
 		cmd = selfpath;
 	}
 
 	if (child == -1)
 		fatal(1, "fork failed");
 
 	if (child == 0) {	/* child */
 		struct rlimit fdlimit = { 1024, 1024 };
 		char fd_data_str[12];
 		char *childargv[2] = { cmd, NULL };
 
 		(void) setrlimit(RLIMIT_NOFILE, &fdlimit);
 
 		(void) close(ztest_fd_rand);
 		VERIFY3U(11, >=,
 		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
 		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
 
 		(void) enable_extended_FILE_stdio(-1, -1);
 		if (libpath != NULL)
 			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
 #ifdef illumos
 		(void) execv(cmd, childargv);
 #else
 		(void) execvp(cmd, childargv);
 #endif
 		/* Only reached when the exec itself failed. */
 		ztest_dump_core = B_FALSE;
 		fatal(B_TRUE, "exec failed: %s", cmd);
 	}
 
 	if (selfpath != NULL) {
 		umem_free(selfpath, MAXPATHLEN);
 		cmd = NULL;
 	}
 
 	/* Keep waiting until it is our child that has been reaped. */
 	do {
 		reaped = waitpid(child, &status, 0);
 	} while (reaped != child);
 
 	if (statusp != NULL)
 		*statusp = status;
 
 	if (WIFEXITED(status)) {
 		if (WEXITSTATUS(status) == 0)
 			return (B_FALSE);
 
 		(void) fprintf(stderr, "child exited with code %d\n",
 		    WEXITSTATUS(status));
 		exit(2);
 	}
 
 	if (WIFSIGNALED(status)) {
 		if (ignorekill && WTERMSIG(status) == SIGKILL)
 			return (B_TRUE);
 
 		(void) fprintf(stderr, "child died with signal %d\n",
 		    WTERMSIG(status));
 		exit(3);
 	}
 
 	(void) fprintf(stderr, "something strange happened to child\n");
 	exit(4);
 	/* NOTREACHED */
 }
 
 /*
  * Child-side initialization entry point: remove any existing
  * zpool.cache at spa_config_path, then run ztest_init() zo_init times,
  * zeroing the shared state before each pass.
  */
 static void
 ztest_run_init(void)
 {
 	ztest_shared_t *zs = ztest_shared;
 	int pass = 1;
 
 	ASSERT(ztest_opts.zo_init != 0);
 
 	/*
 	 * Start from a clean slate: no stale pool cache file.
 	 */
 	(void) remove(spa_config_path);
 
 	/*
 	 * Build the storage pool, once per requested init pass.
 	 */
 	while (pass <= ztest_opts.zo_init) {
 		bzero(zs, sizeof (ztest_shared_t));
 
 		/* Only announce passes when there is more than one. */
 		if (ztest_opts.zo_init != 1 &&
 		    ztest_opts.zo_verbose >= 3) {
 			(void) printf("ztest_init(), pass %d\n", pass);
 		}
 
 		ztest_init(zs);
 		pass++;
 	}
 }
 
 int
 main(int argc, char **argv)
 {
 	int kills = 0;
 	int iters = 0;
 	int older = 0;
 	int newer = 0;
 	ztest_shared_t *zs;
 	ztest_info_t *zi;
 	ztest_shared_callstate_t *zc;
 	char timebuf[100];
 	char numbuf[6];
 	spa_t *spa;
 	char *cmd;
 	boolean_t hasalt;
 	char *fd_data_str = getenv("ZTEST_FD_DATA");
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	dprintf_setup(&argc, argv);
 	zfs_deadman_synctime_ms = 300000;
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);
 
 	if (!fd_data_str) {
 		process_options(argc, argv);
 
 		setup_data_fd();
 		setup_hdr();
 		setup_data();
 		bcopy(&ztest_opts, ztest_shared_opts,
 		    sizeof (*ztest_shared_opts));
 	} else {
 		ztest_fd_data = atoi(fd_data_str);
 		setup_data();
 		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
 	}
 	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
 
 	/* Override location of zpool.cache */
 	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
 	    ztest_opts.zo_dir), !=, -1);
 
 	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
 	    UMEM_NOFAIL);
 	zs = ztest_shared;
 
 	if (fd_data_str) {
 		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
 		metaslab_df_alloc_threshold =
 		    zs->zs_metaslab_df_alloc_threshold;
 
 		if (zs->zs_do_init)
 			ztest_run_init();
 		else
 			ztest_run(zs);
 		exit(0);
 	}
 
 	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
 
 	if (ztest_opts.zo_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
 		    " %llu seconds...\n",
 		    (u_longlong_t)ztest_opts.zo_vdevs,
 		    ztest_opts.zo_datasets,
 		    ztest_opts.zo_threads,
 		    (u_longlong_t)ztest_opts.zo_time);
 	}
 
 	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
 	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
 
 	zs->zs_do_init = B_TRUE;
 	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
 		if (ztest_opts.zo_verbose >= 1) {
 			(void) printf("Executing older ztest for "
 			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
 		}
 		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
 		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
 	} else {
 		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
 	}
 	zs->zs_do_init = B_FALSE;
 
 	zs->zs_proc_start = gethrtime();
 	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
 
 	for (int f = 0; f < ZTEST_FUNCS; f++) {
 		zi = &ztest_info[f];
 		zc = ZTEST_GET_SHARED_CALLSTATE(f);
 		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
 			zc->zc_next = UINT64_MAX;
 		else
 			zc->zc_next = zs->zs_proc_start +
 			    ztest_random(2 * zi->zi_interval[0] + 1);
 	}
 
 	/*
 	 * Run the tests in a loop.  These tests include fault injection
 	 * to verify that self-healing data works, and forced crashes
 	 * to verify that we never lose on-disk consistency.
 	 */
 	while (gethrtime() < zs->zs_proc_stop) {
 		int status;
 		boolean_t killed;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
 		for (int f = 0; f < ZTEST_FUNCS; f++) {
 			zc = ZTEST_GET_SHARED_CALLSTATE(f);
 			zc->zc_count = 0;
 			zc->zc_time = 0;
 		}
 
 		/* Set the allocation switch size */
 		zs->zs_metaslab_df_alloc_threshold =
 		    ztest_random(zs->zs_metaslab_sz / 4) + 1;
 
 		if (!hasalt || ztest_random(2) == 0) {
 			if (hasalt && ztest_opts.zo_verbose >= 1) {
 				(void) printf("Executing newer ztest: %s\n",
 				    cmd);
 			}
 			newer++;
 			killed = exec_child(cmd, NULL, B_TRUE, &status);
 		} else {
 			if (hasalt && ztest_opts.zo_verbose >= 1) {
 				(void) printf("Executing older ztest: %s\n",
 				    ztest_opts.zo_alt_ztest);
 			}
 			older++;
 			killed = exec_child(ztest_opts.zo_alt_ztest,
 			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
 		}
 
 		if (killed)
 			kills++;
 		iters++;
 
 		if (ztest_opts.zo_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
 			now = MIN(now, zs->zs_proc_stop);
 			print_time(zs->zs_proc_stop - now, timebuf);
 			nicenum(zs->zs_space, numbuf);
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
 			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
 			    iters,
 			    WIFEXITED(status) ? "Complete" : "SIGKILL",
 			    (u_longlong_t)zs->zs_enospc_count,
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
 			    100.0 * (now - zs->zs_proc_start) /
 			    (ztest_opts.zo_time * NANOSEC), timebuf);
 		}
 
 		if (ztest_opts.zo_verbose >= 2) {
 			(void) printf("\nWorkload summary:\n\n");
 			(void) printf("%7s %9s   %s\n",
 			    "Calls", "Time", "Function");
 			(void) printf("%7s %9s   %s\n",
 			    "-----", "----", "--------");
 			for (int f = 0; f < ZTEST_FUNCS; f++) {
 				Dl_info dli;
 
 				zi = &ztest_info[f];
 				zc = ZTEST_GET_SHARED_CALLSTATE(f);
 				print_time(zc->zc_time, timebuf);
 				(void) dladdr((void *)zi->zi_func, &dli);
 				(void) printf("%7llu %9s   %s\n",
 				    (u_longlong_t)zc->zc_count, timebuf,
 				    dli.dli_sname);
 			}
 			(void) printf("\n");
 		}
 
 		/*
 		 * It's possible that we killed a child during a rename test,
 		 * in which case we'll have a 'ztest_tmp' pool lying around
 		 * instead of 'ztest'.  Do a blind rename in case this happened.
 		 */
 		kernel_init(FREAD);
 		if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
 			spa_close(spa, FTAG);
 		} else {
 			char tmpname[MAXNAMELEN];
 			kernel_fini();
 			kernel_init(FREAD | FWRITE);
 			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
 			    ztest_opts.zo_pool);
 			(void) spa_rename(tmpname, ztest_opts.zo_pool);
 		}
 		kernel_fini();
 
 		ztest_run_zdb(ztest_opts.zo_pool);
 	}
 
 	if (ztest_opts.zo_verbose >= 1) {
 		if (hasalt) {
 			(void) printf("%d runs of older ztest: %s\n", older,
 			    ztest_opts.zo_alt_ztest);
 			(void) printf("%d runs of newer ztest: %s\n", newer,
 			    cmd);
 		}
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
 	umem_free(cmd, MAXNAMELEN);
 
 	return (0);
 }
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 296518)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 296519)
@@ -1,812 +1,813 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_LIBZFS_H
 #define	_LIBZFS_H
 
 #include <assert.h>
 #include <libnvpair.h>
 #include <sys/mnttab.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/avl.h>
 #include <sys/zfs_ioctl.h>
 #include <libzfs_core.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Miscellaneous ZFS constants
  *
  * The two *_MAXNAMELEN limits alias MAXNAMELEN and the two *_MAXPROPLEN
  * limits alias MAXPATHLEN, so they track whatever those macros are defined
  * as on the build platform.
  */
 #define	ZFS_MAXNAMELEN		MAXNAMELEN
 #define	ZPOOL_MAXNAMELEN	MAXNAMELEN
 #define	ZFS_MAXPROPLEN		MAXPATHLEN
 #define	ZPOOL_MAXPROPLEN	MAXPATHLEN
 
 /*
  * libzfs errors
  *
  * EZFS_SUCCESS is 0 and EZFS_NOMEM is pinned to 2000.  Every enumerator
  * after EZFS_NOMEM has no explicit value and therefore takes the next
  * consecutive integer, so the numeric value of each code depends on its
  * position in this list: inserting or removing an enumerator renumbers
  * every enumerator that follows it (EZFS_UNKNOWN, the last one, included).
  */
 typedef enum zfs_error {
 	EZFS_SUCCESS = 0,	/* no error -- success */
 	EZFS_NOMEM = 2000,	/* out of memory */
 	EZFS_BADPROP,		/* invalid property value */
 	EZFS_PROPREADONLY,	/* cannot set readonly property */
 	EZFS_PROPTYPE,		/* property does not apply to dataset type */
 	EZFS_PROPNONINHERIT,	/* property is not inheritable */
 	EZFS_PROPSPACE,		/* bad quota or reservation */
 	EZFS_BADTYPE,		/* dataset is not of appropriate type */
 	EZFS_BUSY,		/* pool or dataset is busy */
 	EZFS_EXISTS,		/* pool or dataset already exists */
 	EZFS_NOENT,		/* no such pool or dataset */
 	EZFS_BADSTREAM,		/* bad backup stream */
 	EZFS_DSREADONLY,	/* dataset is readonly */
 	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
 	EZFS_INVALIDNAME,	/* invalid dataset name */
 	EZFS_BADRESTORE,	/* unable to restore to destination */
 	EZFS_BADBACKUP,		/* backup failed */
 	EZFS_BADTARGET,		/* bad attach/detach/replace target */
 	EZFS_NODEVICE,		/* no such device in pool */
 	EZFS_BADDEV,		/* invalid device to add */
 	EZFS_NOREPLICAS,	/* no valid replicas */
 	EZFS_RESILVERING,	/* currently resilvering */
 	EZFS_BADVERSION,	/* unsupported version */
 	EZFS_POOLUNAVAIL,	/* pool is currently unavailable */
 	EZFS_DEVOVERFLOW,	/* too many devices in one vdev */
 	EZFS_BADPATH,		/* must be an absolute path */
 	EZFS_CROSSTARGET,	/* rename or clone across pool or dataset */
 	EZFS_ZONED,		/* used improperly in local zone */
 	EZFS_MOUNTFAILED,	/* failed to mount dataset */
 	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
 	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
 	EZFS_FAULT,		/* bad address */
 	EZFS_IO,		/* I/O error */
 	EZFS_INTR,		/* signal received */
 	EZFS_ISSPARE,		/* device is a hot spare */
 	EZFS_INVALCONFIG,	/* invalid vdev configuration */
 	EZFS_RECURSIVE,		/* recursive dependency */
 	EZFS_NOHISTORY,		/* no history object */
 	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
 	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
 	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
 	EZFS_NAMETOOLONG,	/* dataset name is too long */
 	EZFS_OPENFAILED,	/* open of device failed */
 	EZFS_NOCAP,		/* couldn't get capacity */
 	EZFS_LABELFAILED,	/* write of label failed */
 	EZFS_BADWHO,		/* invalid permission who */
 	EZFS_BADPERM,		/* invalid permission */
 	EZFS_BADPERMSET,	/* invalid permission set name */
 	EZFS_NODELEGATION,	/* delegated administration is disabled */
 	EZFS_UNSHARESMBFAILED,	/* failed to unshare over smb */
 	EZFS_SHARESMBFAILED,	/* failed to share over smb */
 	EZFS_BADCACHE,		/* bad cache file */
 	EZFS_ISL2CACHE,		/* device is for the level 2 ARC */
 	EZFS_VDEVNOTSUP,	/* unsupported vdev type */
 	EZFS_NOTSUP,		/* ops not supported on this dataset */
 	EZFS_ACTIVE_SPARE,	/* pool has active shared spare devices */
 	EZFS_UNPLAYED_LOGS,	/* log device has unplayed logs */
 	EZFS_REFTAG_RELE,	/* snapshot release: tag not found */
 	EZFS_REFTAG_HOLD,	/* snapshot hold: tag already exists */
 	EZFS_TAGTOOLONG,	/* snapshot hold/rele: tag too long */
 	EZFS_PIPEFAILED,	/* pipe create failed */
 	EZFS_THREADCREATEFAILED, /* thread create failed */
 	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
 	EZFS_SCRUBBING,		/* currently scrubbing */
 	EZFS_NO_SCRUB,		/* no active scrub */
 	EZFS_DIFF,		/* general failure of zfs diff */
 	EZFS_DIFFDATA,		/* bad zfs diff data */
 	EZFS_POOLREADONLY,	/* pool is in read-only mode */
 	EZFS_UNKNOWN
 } zfs_error_t;
 
 /*
  * The following data structures are all part
  * of the zfs_allow_t data structure which is
  * used for printing 'allow' permissions.
  * It is a linked list of zfs_allow_t's which
  * then contain AVL trees for user/group/sets/...
  * and each one of the entries in those trees has
  * AVL trees for the permissions they belong to and
  * whether they are local, descendent or local+descendent
  * permissions.  The AVL trees are used primarily for
  * sorting purposes, but also so that we can quickly find
  * a given user and/or permission.
  */
 /*
  * One entry in an AVL tree of permissions; z_node is the embedded AVL
  * linkage.
  */
 typedef struct zfs_perm_node {
 	avl_node_t z_node;
 	char z_pname[MAXPATHLEN];	/* permission name (TODO confirm) */
 } zfs_perm_node_t;
 
 /*
  * AVL-tree entry identified by z_key.  Each entry owns three further AVL
  * trees of permissions, one per scope: local+descendent, local only and
  * descendent only.
  */
 typedef struct zfs_allow_node {
 	avl_node_t z_node;		/* embedded AVL linkage */
 	char z_key[MAXPATHLEN];		/* name, such as joe */
 	avl_tree_t z_localdescend;	/* local+descendent perms */
 	avl_tree_t z_local;		/* local permissions */
 	avl_tree_t z_descend;		/* descendent permissions */
 } zfs_allow_node_t;
 
 /*
  * One element of a singly linked list (chained through z_next).  Each
  * element carries a z_setpoint string and five AVL trees.
  */
 typedef struct zfs_allow {
 	struct zfs_allow *z_next;	/* next list element */
 	char z_setpoint[MAXPATHLEN];
 	avl_tree_t z_sets;
 	avl_tree_t z_crperms;	/* presumably create-time perms -- confirm */
 	avl_tree_t z_user;
 	avl_tree_t z_group;
 	avl_tree_t z_everyone;
 } zfs_allow_t;
 
 /*
  * Basic handle types
  */
 typedef struct zfs_handle zfs_handle_t;
 typedef struct zpool_handle zpool_handle_t;
 typedef struct libzfs_handle libzfs_handle_t;
 
 /*
  * Library initialization
  */
 extern libzfs_handle_t *libzfs_init(void);
 extern void libzfs_fini(libzfs_handle_t *);
 
 extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
 extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
 
 extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
 
 extern void zfs_save_arguments(int argc, char **, char *, int);
 extern int zpool_log_history(libzfs_handle_t *, const char *);
 
 extern int libzfs_errno(libzfs_handle_t *);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
 extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
 extern void libzfs_mnttab_init(libzfs_handle_t *);
 extern void libzfs_mnttab_fini(libzfs_handle_t *);
 extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
 extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
     struct mnttab *);
 extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
     const char *, const char *);
 extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
 
 /*
  * Basic handle functions
  */
 extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
 extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
 extern void zpool_close(zpool_handle_t *);
 extern const char *zpool_get_name(zpool_handle_t *);
 extern int zpool_get_state(zpool_handle_t *);
 extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
 extern const char *zpool_pool_state_to_name(pool_state_t);
 extern void zpool_free_handles(libzfs_handle_t *);
 
 /*
  * Iterate over all active pools in the system.
  */
 typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
 extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
 
 /*
  * Functions to create and destroy pools
  */
 extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     nvlist_t *, nvlist_t *);
 extern int zpool_destroy(zpool_handle_t *, const char *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);
 
 /*
  * Options for zpool_vdev_split(), which takes this structure by value.
  *
  * NOTE(review): both members are plain-int bit-fields of width 1.  Whether
  * such a bit-field is signed is implementation-defined in C, so a set flag
  * may read back as -1 rather than 1; test these members for non-zero
  * instead of comparing them against 1.
  */
 typedef struct splitflags {
 	/* do not split, but return the config that would be split off */
 	int dryrun : 1;
 
 	/* after splitting, import the pool */
 	int import : 1;
 } splitflags_t;
 
 /*
  * Functions to manipulate pool and vdev state
  */
 extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
 extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 extern int zpool_reguid(zpool_handle_t *);
 extern int zpool_reopen(zpool_handle_t *);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
 extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
 extern int zpool_vdev_attach(zpool_handle_t *, const char *,
     const char *, nvlist_t *, int);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
 extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
     splitflags_t);
 
 extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
 extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
 extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
 
 extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
     boolean_t *, boolean_t *);
 extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
     boolean_t *, boolean_t *, boolean_t *);
 extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *);
 
 /*
  * Functions to manage pool properties
  */
 extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
 extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
     size_t proplen, zprop_source_t *, boolean_t);
 extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
     zprop_source_t *);
 
 extern const char *zpool_prop_to_name(zpool_prop_t);
 extern const char *zpool_prop_values(zpool_prop_t);
 
 /*
  * Pool health statistics.
  *
  * No enumerator has an explicit value, so they are numbered consecutively
  * from 0 in the order listed; ZPOOL_STATUS_OK is last and therefore has the
  * largest value.  Inserting an enumerator renumbers all that follow it.
  */
 typedef enum {
 	/*
 	 * The following correspond to faults as defined in the (fault.fs.zfs.*)
 	 * event namespace.  Each is associated with a corresponding message ID.
 	 */
 	ZPOOL_STATUS_CORRUPT_CACHE,	/* corrupt /kernel/drv/zpool.cache */
 	ZPOOL_STATUS_MISSING_DEV_R,	/* missing device with replicas */
 	ZPOOL_STATUS_MISSING_DEV_NR,	/* missing device with no replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_R,	/* bad device label with replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_NR,	/* bad device label with no replicas */
 	ZPOOL_STATUS_BAD_GUID_SUM,	/* sum of device guids didn't match */
 	ZPOOL_STATUS_CORRUPT_POOL,	/* pool metadata is corrupted */
 	ZPOOL_STATUS_CORRUPT_DATA,	/* data errors in user (meta)data */
 	ZPOOL_STATUS_FAILING_DEV,	/* device experiencing errors */
 	ZPOOL_STATUS_VERSION_NEWER,	/* newer on-disk version */
 	ZPOOL_STATUS_HOSTID_MISMATCH,	/* last accessed by another system */
 	ZPOOL_STATUS_IO_FAILURE_WAIT,	/* failed I/O, failmode 'wait' */
 	ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
 	ZPOOL_STATUS_BAD_LOG,		/* cannot read log chain(s) */
 
 	/*
 	 * If the pool has unsupported features but can still be opened in
 	 * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
 	 * pool has unsupported features but cannot be opened at all, its
 	 * status is ZPOOL_STATUS_UNSUP_FEAT_READ.
 	 */
 	ZPOOL_STATUS_UNSUP_FEAT_READ,	/* unsupported features for read */
 	ZPOOL_STATUS_UNSUP_FEAT_WRITE,	/* unsupported features for write */
 
 	/*
 	 * These faults have no corresponding message ID.  At the time we are
 	 * checking the status, the original reason for the FMA fault (I/O or
 	 * checksum errors) has been lost.
 	 */
 	ZPOOL_STATUS_FAULTED_DEV_R,	/* faulted device with replicas */
 	ZPOOL_STATUS_FAULTED_DEV_NR,	/* faulted device with no replicas */
 
 	/*
 	 * The following are not faults per se, but still an error possibly
 	 * requiring administrative attention.  There is no corresponding
 	 * message ID.
 	 */
 	ZPOOL_STATUS_VERSION_OLDER,	/* older legacy on-disk version */
 	ZPOOL_STATUS_FEAT_DISABLED,	/* supported features are disabled */
 	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
 	ZPOOL_STATUS_OFFLINE_DEV,	/* device offline */
 	ZPOOL_STATUS_REMOVED_DEV,	/* removed device */
 	ZPOOL_STATUS_NON_NATIVE_ASHIFT,	/* (e.g. 512e dev with ashift of 9) */
 
 	/*
 	 * Finally, the following indicates a healthy pool.
 	 */
 	ZPOOL_STATUS_OK
 } zpool_status_t;
 
 extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
 extern zpool_status_t zpool_import_status(nvlist_t *, char **);
 extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
 
 /*
  * Statistics and configuration functions.
  */
 extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
 extern nvlist_t *zpool_get_features(zpool_handle_t *);
 extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
 extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
 
 /*
  * Import and export functions
  */
 extern int zpool_export(zpool_handle_t *, boolean_t, const char *);
 extern int zpool_export_force(zpool_handle_t *, const char *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     char *altroot);
 extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
     nvlist_t *, int);
 extern void zpool_print_unsup_feat(nvlist_t *config);
 
 /*
  * Search for pools to import
  */
 
 /*
  * Search parameters passed by pointer to zpool_search_import().
  *
  * NOTE(review): can_be_active, unique and exists are plain-int bit-fields
  * of width 1.  Whether such a bit-field is signed is implementation-defined
  * in C, so a set flag may read back as -1 rather than 1; test these members
  * for non-zero instead of comparing them against 1.
  */
 typedef struct importargs {
 	char **path;		/* a list of paths to search		*/
 	int paths;		/* number of paths to search		*/
 	char *poolname;		/* name of a pool to find		*/
 	uint64_t guid;		/* guid of a pool to find		*/
 	char *cachefile;	/* cachefile to use for import		*/
 	int can_be_active : 1;	/* can the pool be active?		*/
 	int unique : 1;		/* does 'poolname' already exist?	*/
 	int exists : 1;		/* set on return if pool already exists	*/
 } importargs_t;
 
 extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
 
 /* legacy pool search routines */
 extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
 extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
     char *, uint64_t);
 
 /*
  * Miscellaneous pool functions
  */
 struct zfs_cmd;
 
 extern const char *zfs_history_event_names[];
 
 extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
     boolean_t verbose);
 extern int zpool_upgrade(zpool_handle_t *, uint64_t);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
 extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
     nvlist_t ***, uint_t *);
 extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
     size_t len);
 extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
 extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
     nvlist_t *);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
  * underlying datasets, only the references to them.
  */
 extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
 extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *);
 extern void zfs_close(zfs_handle_t *);
 extern zfs_type_t zfs_get_type(const zfs_handle_t *);
 extern const char *zfs_get_name(const zfs_handle_t *);
 extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *);
 
 /*
  * Property management functions.  Some functions are shared with the kernel,
  * and are found in sys/fs/zfs.h.
  */
 
 /*
  * zfs dataset property management
  */
 extern const char *zfs_prop_default_string(zfs_prop_t);
 extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
 extern const char *zfs_prop_column_name(zfs_prop_t);
 extern boolean_t zfs_prop_align_right(zfs_prop_t);
 
 extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t,
     nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *);
 
 extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
 extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
     zprop_source_t *, char *, size_t, boolean_t);
 extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
     boolean_t);
 extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
     zprop_source_t *, char *, size_t);
 extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue);
 extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal);
 extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue);
 extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal);
 extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
     char *buf, size_t len);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
 extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
 extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
 extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *);
 
 
 /*
  * One node of a singly linked list of properties, chained through pl_next.
  * Lists of these nodes are handed to zfs_expand_proplist(),
  * zpool_expand_proplist(), zprop_get_list() and zprop_free_list().
  *
  * pl_prop is a plain int rather than zfs_prop_t or zpool_prop_t.
  * NOTE(review): the meaning of the remaining fields is not visible from
  * this header; pl_user_prop presumably names a user-defined property and
  * pl_width/pl_recvd_width presumably hold display widths -- confirm against
  * the implementation.
  */
 typedef struct zprop_list {
 	int		pl_prop;
 	char		*pl_user_prop;
 	struct zprop_list *pl_next;
 	boolean_t	pl_all;
 	size_t		pl_width;
 	size_t		pl_recvd_width;
 	boolean_t	pl_fixed;
 } zprop_list_t;
 
 extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t,
     boolean_t);
 extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
 
 #define	ZFS_MOUNTPOINT_NONE	"none"
 #define	ZFS_MOUNTPOINT_LEGACY	"legacy"
 
 #define	ZFS_FEATURE_DISABLED	"disabled"
 #define	ZFS_FEATURE_ENABLED	"enabled"
 #define	ZFS_FEATURE_ACTIVE	"active"
 
 #define	ZFS_UNSUPPORTED_INACTIVE	"inactive"
 #define	ZFS_UNSUPPORTED_READONLY	"readonly"
 
 /*
  * zpool property management
  */
 extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
 extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
     size_t);
 extern const char *zpool_prop_default_string(zpool_prop_t);
 extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
 extern const char *zpool_prop_column_name(zpool_prop_t);
 extern boolean_t zpool_prop_align_right(zpool_prop_t);
 
 /*
  * Functions shared by zfs and zpool property management.
  */
 extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all,
     boolean_t ordered, zfs_type_t type);
 extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
     zfs_type_t);
 extern void zprop_free_list(zprop_list_t *);
 
 /*
  * Number of slots in the cb_columns array of zprop_get_cbdata_t.
  */
 #define	ZFS_GET_NCOLS	5
 
 /*
  * Column identifiers stored in cb_columns.  GET_COL_NONE is 0; the five
  * enumerators that follow it equal ZFS_GET_NCOLS in number.
  */
 typedef enum {
 	GET_COL_NONE,
 	GET_COL_NAME,
 	GET_COL_PROPERTY,
 	GET_COL_VALUE,
 	GET_COL_RECVD,
 	GET_COL_SOURCE
 } zfs_get_column_t;
 
 /*
  * Functions for printing zfs or zpool properties
  *
  * zprop_get_cbdata_t is the state handed to zprop_print_one_property().
  * Note that cb_colwidths has one more slot than cb_columns.
  */
 typedef struct zprop_get_cbdata {
 	int cb_sources;
 	zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
 	int cb_colwidths[ZFS_GET_NCOLS + 1];
 	boolean_t cb_scripted;
 	boolean_t cb_literal;
 	boolean_t cb_first;
 	zprop_list_t *cb_proplist;
 	zfs_type_t cb_type;
 } zprop_get_cbdata_t;
 
 void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
     const char *, const char *, zprop_source_t, const char *,
     const char *);
 
 /*
  * Iterator functions.
  */
 typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
 extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *);
 extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *);
 
 /*
  * Collection of dataset handles, filled through libzfs_add_handle().
  *
  * cb_getone has the same signature as zfs_iter_f.
  * NOTE(review): cb_alloc and cb_used presumably hold the allocated capacity
  * and the number of occupied slots of cb_handles -- confirm against the
  * implementation.
  */
 typedef struct get_all_cb {
 	zfs_handle_t	**cb_handles;
 	size_t		cb_alloc;
 	size_t		cb_used;
 	boolean_t	cb_verbose;
 	int		(*cb_getone)(zfs_handle_t *, void *);
 } get_all_cb_t;
 
 void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
 int libzfs_dataset_cmp(const void *, const void *);
 
 /*
  * Functions to create and destroy datasets.
  */
 extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
     nvlist_t *);
 extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
 extern int zfs_destroy(zfs_handle_t *, boolean_t);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
 extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
     nvlist_t *props);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
 
 /*
  * Options for zfs_rename(), which takes this structure by value.
  *
  * NOTE(review): all three members are plain-int bit-fields of width 1.
  * Whether such a bit-field is signed is implementation-defined in C, so a
  * set flag may read back as -1 rather than 1; test these members for
  * non-zero instead of comparing them against 1.
  */
 typedef struct renameflags {
 	/* recursive rename */
 	int recurse : 1;
 
 	/* don't unmount file systems */
 	int nounmount : 1;
 
 	/* force unmount file systems */
 	int forceunmount : 1;
 } renameflags_t;
 
 extern int zfs_rename(zfs_handle_t *, const char *, const char *,
     renameflags_t flags);
 
 /*
  * Options for a send operation; passed by pointer to zfs_send() and
  * zfs_send_resume().
  *
  * NOTE(review): the comments on 'verbose' and 'progress' both cite -v;
  * confirm which command-line option actually sets 'progress'.
  */
 typedef struct sendflags {
 	/* print informational messages (ie, -v was specified) */
 	boolean_t verbose;
 
 	/* recursive send  (ie, -R) */
 	boolean_t replicate;
 
 	/* for incrementals, do all intermediate snapshots */
 	boolean_t doall;
 
 	/* if dataset is a clone, do incremental from its origin */
 	boolean_t fromorigin;
 
 	/* do deduplication */
 	boolean_t dedup;
 
 	/* send properties (ie, -p) */
 	boolean_t props;
 
 	/* do not send (no-op, ie. -n) */
 	boolean_t dryrun;
 
 	/* parsable verbose output (ie. -P) */
 	boolean_t parsable;
 
 	/* show progress (ie. -v) */
 	boolean_t progress;
 
 	/* large blocks (>128K) are permitted */
 	boolean_t largeblock;
 
 	/* WRITE_EMBEDDED records of type DATA are permitted */
 	boolean_t embed_data;
 } sendflags_t;
 
 typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
 
 extern int zfs_send(zfs_handle_t *, const char *, const char *,
     sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
 extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
 extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
     const char *);
 extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
     const char *token);
 
 extern int zfs_promote(zfs_handle_t *);
 extern int zfs_hold(zfs_handle_t *, const char *, const char *,
     boolean_t, int);
 extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
     uid_t rid, uint64_t space);
 
 extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
     zfs_userspace_cb_t, void *);
 
 extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
 extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
 
 /*
  * Options for a receive operation; passed by pointer to zfs_receive().
  * The last two members are not meant to be set by ordinary callers:
  * 'byteswap' is documented as internal and 'nomount' as private.
  */
 typedef struct recvflags {
 	/* print informational messages (ie, -v was specified) */
 	boolean_t verbose;
 
 	/* the destination is a prefix, not the exact fs (ie, -d) */
 	boolean_t isprefix;
 
 	/*
 	 * Only the tail of the sent snapshot path is appended to the
 	 * destination to determine the received snapshot name (ie, -e).
 	 */
 	boolean_t istail;
 
 	/* do not actually do the recv, just check if it would work (ie, -n) */
 	boolean_t dryrun;
 
 	/* rollback/destroy filesystems as necessary (eg, -F) */
 	boolean_t force;
 
 	/* set "canmount=off" on all modified filesystems */
 	boolean_t canmountoff;
 
 	/*
 	 * Mark the file systems as "resumable" and do not destroy them if the
 	 * receive is interrupted
 	 */
 	boolean_t resumable;
 
 	/* byteswap flag is used internally; callers need not specify */
 	boolean_t byteswap;
 
 	/* do not mount file systems as they are extracted (private) */
 	boolean_t nomount;
 } recvflags_t;
 
 extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
     recvflags_t *, int, avl_tree_t *);
 
 /*
  * Each enumerator is a distinct single bit, so the values can be combined
  * with bitwise OR.
  * NOTE(review): zfs_show_diffs() takes plain int parameters; presumably one
  * of them carries these flags -- confirm against the implementation.
  */
 typedef enum diff_flags {
 	ZFS_DIFF_PARSEABLE = 0x1,
 	ZFS_DIFF_TIMESTAMP = 0x2,
 	ZFS_DIFF_CLASSIFY = 0x4
 } diff_flags_t;
 
 extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
     int);
 
 /*
  * Miscellaneous functions.
  */
 extern const char *zfs_type_to_name(zfs_type_t);
 extern void zfs_refresh_properties(zfs_handle_t *);
 extern int zfs_name_valid(const char *, zfs_type_t);
 extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
 extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
     zfs_type_t);
 extern int zfs_spa_version(zfs_handle_t *, int *);
 extern boolean_t zfs_bookmark_exists(const char *path);
 
 /*
  * Mount support functions.
  */
 extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
 extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
 extern int zfs_mount(zfs_handle_t *, const char *, int);
 extern int zfs_unmount(zfs_handle_t *, const char *, int);
 extern int zfs_unmountall(zfs_handle_t *, int);
 
 /*
  * Share support functions.
  */
 extern boolean_t zfs_is_shared(zfs_handle_t *);
 extern int zfs_share(zfs_handle_t *);
 extern int zfs_unshare(zfs_handle_t *);
 
 /*
  * Protocol-specific share support functions.
  */
 extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
 extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **);
 extern int zfs_share_nfs(zfs_handle_t *);
 extern int zfs_share_smb(zfs_handle_t *);
 extern int zfs_shareall(zfs_handle_t *);
 extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
 extern int zfs_unshare_smb(zfs_handle_t *, const char *);
 extern int zfs_unshareall_nfs(zfs_handle_t *);
 extern int zfs_unshareall_smb(zfs_handle_t *);
 extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
 extern int zfs_unshareall(zfs_handle_t *);
 extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
     void *, void *, int, zfs_share_op_t);
 
 /*
  * FreeBSD-specific jail support function.
  */
 extern int zfs_jail(zfs_handle_t *, int, int);
 
 /*
  * When dealing with nvlists, verify() is extremely useful
  *
  * Unlike assert(), verify() always evaluates its argument: with NDEBUG
  * defined the expression is still executed and only its result is
  * discarded, so expressions with side effects may be wrapped in verify().
  * Without NDEBUG it expands to assert(EX).  The outer #ifndef leaves any
  * verify() that is already defined untouched.
  */
 #ifndef verify
 #ifdef NDEBUG
 #define	verify(EX)	((void)(EX))
 #else
 #define	verify(EX)	assert(EX)
 #endif
 #endif
 
 /*
  * Utility function to convert a number to a human-readable form.
  */
 extern void zfs_nicenum(uint64_t, char *, size_t);
 extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
 
 /*
  * Given a device or file, determine if it is part of a pool.
  */
 extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
     boolean_t *);
 
 /*
  * Label manipulation.
  */
 extern int zpool_read_label(int, nvlist_t **);
 extern int zpool_clear_label(int);
 
 /* is this zvol valid for use as a dump device? */
 extern int zvol_check_dump_config(char *);
 
 /*
  * Management interfaces for SMB ACL files
  */
 
 int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *);
 int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *);
 int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *);
 int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
 
 /*
  * Enable and disable datasets within a pool by mounting/unmounting and
  * sharing/unsharing them.
  */
 extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
 extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
 
 /*
  * Mappings between vdev and FRU.
  */
 extern void libzfs_fru_refresh(libzfs_handle_t *);
 extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
 extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
 extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
     const char *);
 extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
 extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
 
 #ifndef illumos
 extern int zmount(const char *, const char *, int, char *, char *, int, char *,
     int);
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _LIBZFS_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 296519)
@@ -1,4896 +1,4897 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2012 DEY Storage Systems, Inc.  All rights reserved.
  * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <ctype.h>
 #include <errno.h>
 #include <libintl.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <stddef.h>
 #include <zone.h>
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <priv.h>
 #include <pwd.h>
 #include <grp.h>
 #include <stddef.h>
 #include <idmap.h>
 
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/misc.h>
 #include <libzfs.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 #include "zfs_deleg.h"
 
 static int userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
 
 /*
  * Map a single dataset type (not a mask of several types) to its
  * human-readable name, translated through dgettext().  Any type other than
  * filesystem, snapshot or volume yields NULL.
  */
 const char *
 zfs_type_to_name(zfs_type_t type)
 {
 	if (type == ZFS_TYPE_FILESYSTEM)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 	if (type == ZFS_TYPE_SNAPSHOT)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 	if (type == ZFS_TYPE_VOLUME)
 		return (dgettext(TEXT_DOMAIN, "volume"));
 
 	return (NULL);
 }
 
 /*
  * Produce a translated description ("snapshot", "filesystem" or "volume") for
  * a dataset that could not be opened, given its path and the mask of
  * acceptable types.  Since the real type is unknown, the answer is a guess
  * derived from the mask and from whether the path contains a '@'.
  */
 static const char *
 path_to_str(const char *path, int types)
 {
 	/* A mask naming exactly one type is always reported as that type. */
 	if (types == ZFS_TYPE_SNAPSHOT)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 	if (types == ZFS_TYPE_FILESYSTEM)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 	if (types == ZFS_TYPE_VOLUME)
 		return (dgettext(TEXT_DOMAIN, "volume"));
 
 	/*
 	 * More than one type is acceptable, so look at the path.  A '@' means
 	 * "snapshot" when snapshots are among the candidates; without one the
 	 * snapshot candidate is dropped and the remaining types are examined.
 	 */
 	if (types & ZFS_TYPE_SNAPSHOT) {
 		if (strchr(path, '@') != NULL)
 			return (dgettext(TEXT_DOMAIN, "snapshot"));
 		types &= ~ZFS_TYPE_SNAPSHOT;
 	}
 
 	/*
 	 * Only filesystems and/or volumes remain.  There is no way to tell
 	 * them apart from the name, so filesystem wins whenever it is allowed.
 	 */
 	if (types & ZFS_TYPE_FILESYSTEM)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 
 	assert(types & ZFS_TYPE_VOLUME);
 	return (dgettext(TEXT_DOMAIN, "volume"));
 }
 
 /*
  * Check a dataset name for validity before any attempt to open it, so that a
  * more meaningful error can be reported.  When 'hdl' is non-NULL the specific
  * reason for rejection is recorded with zfs_error_aux().
  *
  * Returns 0 if the name is invalid and -1 (i.e. non-zero) if it is
  * acceptable, so the result can be used as a boolean.
  */
 int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
     boolean_t modifying)
 {
 	namecheck_err_t why;
 	char what;
 	const char *at;
 
 	(void) zfs_prop_get_table();
 
 	if (dataset_namecheck(path, &why, &what) != 0) {
 		/* Without a handle there is nowhere to record the reason. */
 		if (hdl == NULL)
 			return (0);
 
 		switch (why) {
 		case NAME_ERR_TOOLONG:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "name is too long"));
 			break;
 
 		case NAME_ERR_LEADING_SLASH:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "leading slash in name"));
 			break;
 
 		case NAME_ERR_EMPTY_COMPONENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "empty component in name"));
 			break;
 
 		case NAME_ERR_TRAILING_SLASH:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "trailing slash in name"));
 			break;
 
 		case NAME_ERR_INVALCHAR:
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "invalid character "
 			    "'%c' in name"), what);
 			break;
 
 		case NAME_ERR_MULTIPLE_AT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "multiple '@' delimiters in name"));
 			break;
 
 		case NAME_ERR_NOLETTER:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool doesn't begin with a letter"));
 			break;
 
 		case NAME_ERR_RESERVED:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "name is reserved"));
 			break;
 
 		case NAME_ERR_DISKLIKE:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "reserved disk name"));
 			break;
 		}
 
 		return (0);
 	}
 
 	at = strchr(path, '@');
 
 	/* A '@' is only legal when snapshots are among the allowed types. */
 	if (!(type & ZFS_TYPE_SNAPSHOT) && at != NULL) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot delimiter '@' in filesystem name"));
 		return (0);
 	}
 
 	/* Conversely, a name that must be a snapshot needs its '@'. */
 	if (type == ZFS_TYPE_SNAPSHOT && at == NULL) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing '@' delimiter in snapshot name"));
 		return (0);
 	}
 
 	/* '%' is rejected only when the caller intends to modify the name. */
 	if (modifying && strchr(path, '%') != NULL) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid character %c in name"), '%');
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Report whether 'name' is a valid name for an object of the given type.
  * Pool names are checked by zpool_name_valid(); every other type goes through
  * zfs_validate_name().  A NULL handle is passed to both, so no error details
  * are recorded.
  */
 int
 zfs_name_valid(const char *name, zfs_type_t type)
 {
 	if (type != ZFS_TYPE_POOL)
 		return (zfs_validate_name(NULL, name, type, B_FALSE));
 
 	return (zpool_name_valid(NULL, B_FALSE, name));
 }
 
 /*
  * Build a new nvlist holding only the user-defined properties found in the
  * raw DSL property list 'props'.  Returns the new list, or NULL after
  * reporting an out-of-memory condition through no_memory().
  */
 static nvlist_t *
 process_user_props(zfs_handle_t *zhp, nvlist_t *props)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *userprops;
 	nvlist_t *value;
 	nvpair_t *pair;
 
 	if (nvlist_alloc(&userprops, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	for (pair = nvlist_next_nvpair(props, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(props, pair)) {
 		const char *name = nvpair_name(pair);
 
 		/* Native properties are not copied. */
 		if (!zfs_prop_user(name))
 			continue;
 
 		verify(nvpair_value_nvlist(pair, &value) == 0);
 		if (nvlist_add_nvlist(userprops, name, value) != 0) {
 			nvlist_free(userprops);
 			(void) no_memory(hdl);
 			return (NULL);
 		}
 	}
 
 	return (userprops);
 }
 
 /*
  * Open the named pool and push the resulting handle onto the front of the
  * library handle's list of pool handles.  Returns the new handle, or NULL if
  * the pool could not be opened (the list is then left untouched).
  */
 static zpool_handle_t *
 zpool_add_handle(zfs_handle_t *zhp, const char *pool_name)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zpool_handle_t *zph;
 
 	zph = zpool_open_canfail(hdl, pool_name);
 	if (zph == NULL)
 		return (NULL);
 
 	if (hdl->libzfs_pool_handles != NULL)
 		zph->zpool_next = hdl->libzfs_pool_handles;
 	hdl->libzfs_pool_handles = zph;
 
 	return (zph);
 }
 
 /*
  * Walk the library handle's list of pool handles looking for one whose name
  * matches 'pool_name' over the first 'len' bytes.  Returns the matching
  * handle, or NULL when no entry matches.
  */
 static zpool_handle_t *
 zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len)
 {
 	zpool_handle_t *cur;
 
 	for (cur = zhp->zfs_hdl->libzfs_pool_handles; cur != NULL;
 	    cur = cur->zpool_next) {
 		if (strncmp(pool_name, zpool_get_name(cur), len) == 0)
 			break;
 	}
 
 	return (cur);
 }
 
 /*
  * Return a handle to the pool containing the given dataset.  An existing
  * handle from the library's list is reused when there is one; otherwise a new
  * handle is opened and added to that list.
  */
 static zpool_handle_t *
 zpool_handle(zfs_handle_t *zhp)
 {
 	/*
 	 * The pool name is everything up to the first '/', '@' or '#'.  The
 	 * extra byte leaves room for the terminating NUL.
 	 */
 	int buflen = strcspn(zhp->zfs_name, "/@#") + 1;
 	char *pool = zfs_alloc(zhp->zfs_hdl, buflen);
 	zpool_handle_t *found;
 
 	(void) strlcpy(pool, zhp->zfs_name, buflen);
 
 	found = zpool_find_handle(zhp, pool, buflen);
 	if (found == NULL)
 		found = zpool_add_handle(zhp, pool);
 
 	free(pool);
 	return (found);
 }
 
 /*
  * Close every pool handle on the library handle's list and reset the list to
  * empty.
  */
 void
 zpool_free_handles(libzfs_handle_t *hdl)
 {
 	zpool_handle_t *zph;
 	zpool_handle_t *next;
 
 	for (zph = hdl->libzfs_pool_handles; zph != NULL; zph = next) {
 		/* Read the link first: zpool_close() disposes of 'zph'. */
 		next = zph->zpool_next;
 		zpool_close(zph);
 	}
 
 	hdl->libzfs_pool_handles = NULL;
 }
 
 /*
  * Issue ZFS_IOC_OBJSET_STATS for the dataset named in 'zhp', storing the
  * result in 'zc'.  While the ioctl fails with ENOMEM the destination nvlist
  * buffer is enlarged and the call retried.  Returns 0 on success and -1 on
  * any other ioctl error or if the buffer cannot be enlarged.
  */
 static int
 get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 	(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
 
 	while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
 		/* Only a too-small buffer is worth another attempt. */
 		if (errno != ENOMEM || zcmd_expand_dst_nvlist(hdl, zc) != 0)
 			return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Fetch the received properties of the dataset through
  * ZFS_IOC_OBJSET_RECVD_PROPS and store them in zhp->zfs_recvd_props,
  * replacing any list previously held there.  Returns 0 on success, -1 on
  * failure (the previously stored list is then left in place).
  */
 static int
 get_recvd_props_ioctl(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	nvlist_t *received;
 	int rc;
 
 	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
 		return (-1);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
 		if (errno != ENOMEM) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 
 		/* The buffer was too small: enlarge it and retry. */
 		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0)
 			return (-1);
 	}
 
 	rc = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &received);
 	zcmd_free_nvlists(&zc);
 	if (rc != 0)
 		return (-1);
 
 	nvlist_free(zhp->zfs_recvd_props);
 	zhp->zfs_recvd_props = received;
 
 	return (0);
 }
 
 /*
  * Copy the objset statistics from 'zc' into the handle and replace the
  * handle's property lists (all properties and user properties) with the
  * ones unpacked from the ioctl result.  Returns 0 on success and -1 on
  * failure; on failure the handle's existing property lists are kept.
  */
 static int
 put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	nvlist_t *all;
 	nvlist_t *user;
 
 	zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
 
 	if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &all) != 0)
 		return (-1);
 
 	/*
 	 * XXX Why do we store the user props separately, in addition to
 	 * storing them in zfs_props?
 	 */
 	user = process_user_props(zhp, all);
 	if (user == NULL) {
 		nvlist_free(all);
 		return (-1);
 	}
 
 	/* Both new lists exist, so the old ones can now be dropped. */
 	nvlist_free(zhp->zfs_props);
 	zhp->zfs_props = all;
 
 	nvlist_free(zhp->zfs_user_props);
 	zhp->zfs_user_props = user;
 
 	return (0);
 }
 
 /*
  * Query the kernel for the dataset's statistics and load them into the
  * handle.  Returns 0 on success and -1 on failure.
  */
 static int
 get_stats(zfs_handle_t *zhp)
 {
 	zfs_cmd_t zc = { 0 };
 	int rc;
 
 	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
 		return (-1);
 
 	/* The handle is only updated when the ioctl itself succeeded. */
 	if (get_stats_ioctl(zhp, &zc) != 0 || put_stats_zhdl(zhp, &zc) != 0)
 		rc = -1;
 	else
 		rc = 0;
 
 	zcmd_free_nvlists(&zc);
 	return (rc);
 }
 
 /*
  * Refresh the properties currently stored in the handle by re-running
  * get_stats().  The result of get_stats() is deliberately discarded, so the
  * caller is not told whether the refresh succeeded.
  */
 void
 zfs_refresh_properties(zfs_handle_t *zhp)
 {
 	(void) get_stats(zhp);
 }
 
 /*
  * Common tail of handle construction, used when creating a handle from a
  * dataset name (zfs_open()) and when creating child handles on the fly
  * (zfs_iter_*).  Loads the statistics held in 'zc' into the handle, derives
  * the head type and type from the DMU objset type, and attaches the handle of
  * the containing pool.
  *
  * Returns 0 on success and -1 on failure.  If the pool handle cannot be
  * obtained, the property lists attached by this call are released again so
  * that a caller which simply free()s the handle does not leak them.
  */
 static int
 make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	if (put_stats_zhdl(zhp, zc) != 0)
 		return (-1);
 
 	/*
 	 * We've managed to open the dataset and gather statistics.  Determine
 	 * the high-level type.
 	 */
 	if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_head_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();
 
 	if (zhp->zfs_dmustats.dds_is_snapshot)
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();	/* we should never see any other types */
 
 	if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
 		/*
 		 * put_stats_zhdl() above attached freshly allocated property
 		 * lists to the handle.  Release them and clear the pointers:
 		 * the handle is about to be discarded, and a plain free() of
 		 * it would otherwise leak both lists.
 		 */
 		nvlist_free(zhp->zfs_props);
 		zhp->zfs_props = NULL;
 		nvlist_free(zhp->zfs_user_props);
 		zhp->zfs_user_props = NULL;
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate a dataset handle for 'path' and populate it with statistics
  * obtained from the kernel.  Returns the new handle, or NULL if allocation,
  * the stats ioctl, or the common handle setup fails.
  */
 zfs_handle_t *
 make_dataset_handle(libzfs_handle_t *hdl, const char *path)
 {
 	zfs_cmd_t zc = { 0 };
 	zfs_handle_t *zhp;
 
 	zhp = calloc(sizeof (zfs_handle_t), 1);
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = hdl;
 	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 
 	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
 		free(zhp);
 		return (NULL);
 	}
 
 	/* Either step failing means the handle is unusable. */
 	if (get_stats_ioctl(zhp, &zc) == -1 ||
 	    make_dataset_handle_common(zhp, &zc) == -1) {
 		free(zhp);
 		zhp = NULL;
 	}
 
 	zcmd_free_nvlists(&zc);
 	return (zhp);
 }
 
 /*
  * Allocate a dataset handle from an already-filled zfs_cmd_t: the dataset
  * name and statistics are taken from 'zc' rather than fetched again.
  * Returns the new handle, or NULL on failure.
  */
 zfs_handle_t *
 make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
 {
 	zfs_handle_t *zhp;
 
 	zhp = calloc(sizeof (zfs_handle_t), 1);
 	if (zhp != NULL) {
 		zhp->zfs_hdl = hdl;
 		(void) strlcpy(zhp->zfs_name, zc->zc_name,
 		    sizeof (zhp->zfs_name));
 
 		if (make_dataset_handle_common(zhp, zc) == -1) {
 			free(zhp);
 			zhp = NULL;
 		}
 	}
 
 	return (zhp);
 }
 
 /*
  * Allocate a lightweight snapshot handle named after zc->zc_name.  No
  * statistics or properties are loaded; the head type is copied from the
  * type of the parent handle 'pzhp'.  Returns NULL only if the allocation
  * fails.
  *
  * NOTE(review): the result of zpool_handle() is stored unchecked, so the
  * returned handle may carry a NULL zpool_hdl.
  */
 zfs_handle_t *
 make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
 {
 	zfs_handle_t *zhp;
 
 	if ((zhp = calloc(sizeof (zfs_handle_t), 1)) == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = pzhp->zfs_hdl;
 	zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	zhp->zfs_head_type = pzhp->zfs_type;
 	(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
 
 	/* The pool lookup needs the name, so it comes last. */
 	zhp->zpool_hdl = zpool_handle(zhp);
 
 	return (zhp);
 }
 
 /*
  * Create an independent copy of a dataset handle.  The three property
  * nvlists and the mount options string are duplicated; the library handle,
  * pool handle and property table pointers are shared with the original.
  * Returns the copy, or NULL if memory runs out.
  */
 zfs_handle_t *
 zfs_handle_dup(zfs_handle_t *zhp_orig)
 {
 	zfs_handle_t *copy;
 
 	copy = calloc(sizeof (zfs_handle_t), 1);
 	if (copy == NULL)
 		return (NULL);
 
 	copy->zfs_hdl = zhp_orig->zfs_hdl;
 	copy->zpool_hdl = zhp_orig->zpool_hdl;
 	(void) strlcpy(copy->zfs_name, zhp_orig->zfs_name,
 	    sizeof (copy->zfs_name));
 	copy->zfs_type = zhp_orig->zfs_type;
 	copy->zfs_head_type = zhp_orig->zfs_head_type;
 	copy->zfs_dmustats = zhp_orig->zfs_dmustats;
 
 	/* Lists absent from the original stay NULL in the copy. */
 	if (zhp_orig->zfs_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_props, &copy->zfs_props, 0) != 0)
 		goto nomem;
 
 	if (zhp_orig->zfs_user_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_user_props,
 	    &copy->zfs_user_props, 0) != 0)
 		goto nomem;
 
 	if (zhp_orig->zfs_recvd_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_recvd_props,
 	    &copy->zfs_recvd_props, 0) != 0)
 		goto nomem;
 
 	copy->zfs_mntcheck = zhp_orig->zfs_mntcheck;
 	if (zhp_orig->zfs_mntopts != NULL) {
 		copy->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
 		    zhp_orig->zfs_mntopts);
 	}
 	copy->zfs_props_table = zhp_orig->zfs_props_table;
 
 	return (copy);
 
 nomem:
 	/* zfs_close() releases whatever was duplicated so far. */
 	(void) no_memory(copy->zfs_hdl);
 	zfs_close(copy);
 	return (NULL);
 }
 
 /*
  * Report whether the bookmark named by 'path' (of the form "fs#bookmark")
  * exists.  The part before '#' is the filesystem that is queried for its
  * bookmarks; the part after it is the bookmark name looked up in the result.
  * Returns B_FALSE if 'path' contains no '#', if the bookmark list cannot be
  * obtained, or if the bookmark is not in it.
  */
 boolean_t
 zfs_bookmark_exists(const char *path)
 {
 	/*
 	 * Start from NULL so that the nvlist_free() on the error path below
 	 * never sees an indeterminate pointer, whether or not
 	 * lzc_get_bookmarks() stores to its output argument when it fails.
 	 */
 	nvlist_t *bmarks = NULL;
 	nvlist_t *props;
 	char fsname[ZFS_MAXNAMELEN];
 	char *bmark_name;
 	char *pound;
 	int err;
 	boolean_t rv;
 
 	/* Work on a private copy: the name is split in place at the '#'. */
 	(void) strlcpy(fsname, path, sizeof (fsname));
 	pound = strchr(fsname, '#');
 	if (pound == NULL)
 		return (B_FALSE);
 
 	*pound = '\0';
 	bmark_name = pound + 1;
 	props = fnvlist_alloc();
 	err = lzc_get_bookmarks(fsname, props, &bmarks);
 	nvlist_free(props);
 	if (err != 0) {
 		nvlist_free(bmarks);
 		return (B_FALSE);
 	}
 
 	rv = nvlist_exists(bmarks, bmark_name);
 	nvlist_free(bmarks);
 	return (rv);
 }
 
 /*
  * Allocate a handle of type ZFS_TYPE_BOOKMARK for 'path'.  The library
  * handle and head type are taken from 'parent', and the handle's properties
  * are a private copy of 'bmark_props'.  Returns the new handle, or NULL if
  * allocation, the property copy, or the pool lookup fails.
  */
 zfs_handle_t *
 make_bookmark_handle(zfs_handle_t *parent, const char *path,
     nvlist_t *bmark_props)
 {
 	zfs_handle_t *zhp;
 
 	if ((zhp = calloc(sizeof (zfs_handle_t), 1)) == NULL)
 		return (NULL);
 
 	/* Name and types. */
 	zhp->zfs_hdl = parent->zfs_hdl;
 	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 	zhp->zfs_head_type = parent->zfs_head_type;
 	zhp->zfs_type = ZFS_TYPE_BOOKMARK;
 
 	/* Property list. */
 	if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) {
 		free(zhp);
 		return (NULL);
 	}
 
 	/* Containing pool. */
 	zhp->zpool_hdl = zpool_handle(zhp);
 	if (zhp->zpool_hdl == NULL) {
 		nvlist_free(zhp->zfs_props);
 		free(zhp);
 		return (NULL);
 	}
 
 	return (zhp);
 }
 
 /*
  * Opens the given snapshot, filesystem, or volume.   The 'types'
  * argument is a mask of acceptable types.  The function will print an
  * appropriate error message and return NULL if it can't be opened.
  *
  * Failure cases, in order: the name fails validation (EZFS_INVALIDNAME),
  * the dataset handle cannot be created (reported from errno through
  * zfs_standard_error()), or the dataset exists but its type is not in
  * 'types' (EZFS_BADTYPE).
  */
 zfs_handle_t *
 zfs_open(libzfs_handle_t *hdl, const char *path, int types)
 {
 	zfs_handle_t *zhp;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 
 	/*
 	 * Validate the name before we even try to open it.
 	 */
 	if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid dataset name"));
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 		return (NULL);
 	}
 
 	/*
 	 * Try to get stats for the dataset, which will tell us if it exists.
 	 * errno is cleared first so that the value reported below reflects
 	 * this attempt.
 	 */
 	errno = 0;
 	if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
 		(void) zfs_standard_error(hdl, errno, errbuf);
 		return (NULL);
 	}
 
 	if (!(types & zhp->zfs_type)) {
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		zfs_close(zhp);
 		return (NULL);
 	}
 
 	return (zhp);
 }
 
 /*
  * Dispose of a ZFS handle: free the mount options string, the three
  * property nvlists and finally the handle itself.  The pool handle it
  * points to is not touched here.
  */
 void
 zfs_close(zfs_handle_t *zhp)
 {
 	/* free(NULL) is a no-op, so unset mount options need no guard. */
 	free(zhp->zfs_mntopts);
 
 	nvlist_free(zhp->zfs_props);
 	nvlist_free(zhp->zfs_user_props);
 	nvlist_free(zhp->zfs_recvd_props);
 
 	free(zhp);
 }
 
 /*
  * One entry of the per-handle mount table cache (libzfs_mnttab_cache), an
  * AVL tree ordered by the mnt_special field of the stored entry.
  */
 typedef struct mnttab_node {
 	struct mnttab mtn_mt;	/* the cached mount table entry */
 	avl_node_t mtn_node;	/* linkage into the AVL tree */
 } mnttab_node_t;
 
 /*
  * AVL comparator for the mount table cache: orders nodes by the mnt_special
  * string of their entries and normalizes the strcmp() result to -1, 0 or 1.
  */
 static int
 libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
 {
 	const mnttab_node_t *lhs = arg1;
 	const mnttab_node_t *rhs = arg2;
 	int cmp = strcmp(lhs->mtn_mt.mnt_special, rhs->mtn_mt.mnt_special);
 
 	if (cmp > 0)
 		return (1);
 	if (cmp < 0)
 		return (-1);
 	return (0);
 }
 
 /*
  * Set up the (empty) mount table cache of a library handle as an AVL tree of
  * mnttab_node_t, ordered by libzfs_mnttab_cache_compare().  The cache is
  * asserted to hold no nodes beforehand.
  */
 void
 libzfs_mnttab_init(libzfs_handle_t *hdl)
 {
 	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
 	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
 }
 
 /*
  * Re-read the mount table from its beginning and add a copy of every entry
  * whose filesystem type is MNTTYPE_ZFS to the cache.  Entries already in the
  * cache are not removed first.
  */
 void
 libzfs_mnttab_update(libzfs_handle_t *hdl)
 {
 	struct mnttab ent;
 
 	rewind(hdl->libzfs_mnttab);
 	while (getmntent(hdl->libzfs_mnttab, &ent) == 0) {
 		if (strcmp(ent.mnt_fstype, MNTTYPE_ZFS) == 0) {
 			mnttab_node_t *node;
 
 			/* The node keeps private copies of all strings. */
 			node = zfs_alloc(hdl, sizeof (mnttab_node_t));
 			node->mtn_mt.mnt_special =
 			    zfs_strdup(hdl, ent.mnt_special);
 			node->mtn_mt.mnt_mountp =
 			    zfs_strdup(hdl, ent.mnt_mountp);
 			node->mtn_mt.mnt_fstype =
 			    zfs_strdup(hdl, ent.mnt_fstype);
 			node->mtn_mt.mnt_mntopts =
 			    zfs_strdup(hdl, ent.mnt_mntopts);
 			avl_add(&hdl->libzfs_mnttab_cache, node);
 		}
 	}
 }
 
 /*
  * Tear down the mount table cache: free every node together with the
  * strings it owns, then destroy the AVL tree itself.
  */
 void
 libzfs_mnttab_fini(libzfs_handle_t *hdl)
 {
 	mnttab_node_t *node;
 	void *cookie = NULL;
 
 	while ((node = avl_destroy_nodes(&hdl->libzfs_mnttab_cache,
 	    &cookie)) != NULL) {
 		free(node->mtn_mt.mnt_special);
 		free(node->mtn_mt.mnt_mountp);
 		free(node->mtn_mt.mnt_fstype);
 		free(node->mtn_mt.mnt_mntopts);
 		free(node);
 	}
 
 	avl_destroy(&hdl->libzfs_mnttab_cache);
 }
 
 /*
  * Turn use of the mount table cache on or off for this library handle.
  * Only the flag is recorded here; libzfs_mnttab_find() consults it to decide
  * between the cache and a direct scan of the mount table.
  */
 void
 libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable)
 {
 	hdl->libzfs_mnttab_enable = enable;
 }
 
 /*
  * Look up the mount table entry for the ZFS dataset 'fsname' and copy it to
  * '*entry'.  Returns 0 if an entry was found and ENOENT otherwise.
  *
  * With caching disabled the mount table is scanned directly (after throwing
  * away any cache left over from earlier).  With caching enabled the cache is
  * filled on first use and then searched.
  */
 int
 libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
     struct mnttab *entry)
 {
 	mnttab_node_t key;
 	mnttab_node_t *hit;
 
 	if (!hdl->libzfs_mnttab_enable) {
 		struct mnttab srch = { 0 };
 
 		if (avl_numnodes(&hdl->libzfs_mnttab_cache))
 			libzfs_mnttab_fini(hdl);
 
 		rewind(hdl->libzfs_mnttab);
 		srch.mnt_special = (char *)fsname;
 		srch.mnt_fstype = MNTTYPE_ZFS;
 		if (getmntany(hdl->libzfs_mnttab, entry, &srch) != 0)
 			return (ENOENT);
 		return (0);
 	}
 
 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
 		libzfs_mnttab_update(hdl);
 
 	/* Only mnt_special takes part in the tree's ordering. */
 	key.mtn_mt.mnt_special = (char *)fsname;
 	hit = avl_find(&hdl->libzfs_mnttab_cache, &key, NULL);
 	if (hit == NULL)
 		return (ENOENT);
 
 	*entry = hit->mtn_mt;
 	return (0);
 }
 
 /*
  * Record a newly mounted ZFS filesystem in the mount table cache.  Nothing
  * is added while the cache is empty.
  */
 void
 libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
     const char *mountp, const char *mntopts)
 {
 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) {
 		mnttab_node_t *node;
 
 		node = zfs_alloc(hdl, sizeof (mnttab_node_t));
 		node->mtn_mt.mnt_special = zfs_strdup(hdl, special);
 		node->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
 		node->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
 		node->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
 		avl_add(&hdl->libzfs_mnttab_cache, node);
 	}
 }
 
 /*
  * Drop the cache entry for 'fsname', if there is one, and free the node
  * together with the strings it owns.
  */
 void
 libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 {
 	mnttab_node_t key;
 	mnttab_node_t *victim;
 
 	key.mtn_mt.mnt_special = (char *)fsname;
 	victim = avl_find(&hdl->libzfs_mnttab_cache, (void *)&key, NULL);
 	if (victim == NULL)
 		return;
 
 	avl_remove(&hdl->libzfs_mnttab_cache, victim);
 	free(victim->mtn_mt.mnt_special);
 	free(victim->mtn_mt.mnt_mountp);
 	free(victim->mtn_mt.mnt_fstype);
 	free(victim->mtn_mt.mnt_mntopts);
 	free(victim);
 }
 
 /*
  * Store the version property of the pool containing the dataset in
  * '*spa_version'.  Returns 0 on success, or -1 (leaving '*spa_version'
  * untouched) if the handle has no pool handle.
  */
 int
 zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
 {
 	zpool_handle_t *zph = zhp->zpool_hdl;
 
 	if (zph != NULL) {
 		*spa_version = zpool_get_prop_int(zph,
 		    ZPOOL_PROP_VERSION, NULL);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Pick the reservation property appropriate for the pool's SPA version:
  * refreservation from SPA_VERSION_REFRESERVATION onwards, reservation
  * before that.  Returns 0 on success, or -1 if the pool version cannot be
  * determined.
  */
 static int
 zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
 {
 	int version;
 
 	if (zfs_spa_version(zhp, &version) < 0)
 		return (-1);
 
 	*resv_prop = (version >= SPA_VERSION_REFRESERVATION) ?
 	    ZFS_PROP_REFRESERVATION : ZFS_PROP_RESERVATION;
 
 	return (0);
 }
 
 /*
  * Given an nvlist of properties to set, validates that they are correct, and
  * parses any numeric properties (index, boolean, etc) if they are specified as
  * strings.
  */
 nvlist_t *
 zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
     uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
     const char *errbuf)
 {
 	nvpair_t *elem;
 	uint64_t intval;
 	char *strval;
 	zfs_prop_t prop;
 	nvlist_t *ret;
 	int chosen_normal = -1;
 	int chosen_utf = -1;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	/*
 	 * Make sure this property is valid and applies to this type.
 	 */
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 
 		prop = zfs_name_to_prop(propname);
 		if (prop == ZPROP_INVAL && zfs_prop_user(propname)) {
 			/*
 			 * This is a user property: make sure it's a
 			 * string, and that it's less than ZAP_MAXNAMELEN.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_STRING) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a string"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "property name '%s' is too long"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			(void) nvpair_value_string(elem, &strval);
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 			continue;
 		}
 
 		/*
 		 * Currently, only user properties can be modified on
 		 * snapshots.
 		 */
 		if (type == ZFS_TYPE_SNAPSHOT) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "this property can not be modified for snapshots"));
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) {
 			zfs_userquota_prop_t uqtype;
 			char newpropname[128];
 			char domain[128];
 			uint64_t rid;
 			uint64_t valary[3];
 
 			if (userquota_propname_decode(propname, zoned,
 			    &uqtype, domain, sizeof (domain), &rid) != 0) {
 				zfs_error_aux(hdl,
 				    dgettext(TEXT_DOMAIN,
 				    "'%s' has an invalid user/group name"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			if (uqtype != ZFS_PROP_USERQUOTA &&
 			    uqtype != ZFS_PROP_GROUPQUOTA) {
 				zfs_error_aux(hdl,
 				    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_PROPREADONLY,
 				    errbuf);
 				goto error;
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				(void) nvpair_value_string(elem, &strval);
 				if (strcmp(strval, "none") == 0) {
 					intval = 0;
 				} else if (zfs_nicestrtonum(hdl,
 				    strval, &intval) != 0) {
 					(void) zfs_error(hdl,
 					    EZFS_BADPROP, errbuf);
 					goto error;
 				}
 			} else if (nvpair_type(elem) ==
 			    DATA_TYPE_UINT64) {
 				(void) nvpair_value_uint64(elem, &intval);
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "use 'none' to disable "
 					    "userquota/groupquota"));
 					goto error;
 				}
 			} else {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a number"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			/*
 			 * Encode the prop name as
 			 * userquota@<hex-rid>-domain, to make it easy
 			 * for the kernel to decode.
 			 */
 			(void) snprintf(newpropname, sizeof (newpropname),
 			    "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
 			    (longlong_t)rid, domain);
 			valary[0] = uqtype;
 			valary[1] = rid;
 			valary[2] = intval;
 			if (nvlist_add_uint64_array(ret, newpropname,
 			    valary, 3) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 			continue;
 		} else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		if (prop == ZPROP_INVAL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), propname);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			goto error;
 		}
 
 		if (!zfs_prop_valid_for_type(prop, type)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' does not "
 			    "apply to datasets of this type"), propname);
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		if (zfs_prop_readonly(prop) &&
 		    (!zfs_prop_setonce(prop) || zhp != NULL)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		if (zprop_parse_value(hdl, elem, prop, type, ret,
 		    &strval, &intval, errbuf) != 0)
 			goto error;
 
 		/*
 		 * Perform some additional checks for specific properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_VERSION:
 		{
 			int version;
 
 			if (zhp == NULL)
 				break;
 			version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
 			if (intval < version) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "Can not downgrade; already at version %u"),
 				    version);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 		}
 
 		case ZFS_PROP_VOLBLOCKSIZE:
 		case ZFS_PROP_RECORDSIZE:
 		{
 			int maxbs = SPA_MAXBLOCKSIZE;
 			if (zpool_hdl != NULL) {
 				maxbs = zpool_get_prop_int(zpool_hdl,
 				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
 			}
 			/*
 			 * Volumes are limited to a volblocksize of 128KB,
 			 * because they typically service workloads with
 			 * small random writes, which incur a large performance
 			 * penalty with large blocks.
 			 */
 			if (prop == ZFS_PROP_VOLBLOCKSIZE)
 				maxbs = SPA_OLD_MAXBLOCKSIZE;
 			/*
 			 * The value must be a power of two between
 			 * SPA_MINBLOCKSIZE and maxbs.
 			 */
 			if (intval < SPA_MINBLOCKSIZE ||
 			    intval > maxbs || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be power of 2 from 512B "
 				    "to %uKB"), propname, maxbs >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 		}
 		case ZFS_PROP_MLSLABEL:
 		{
 #ifdef illumos
 			/*
 			 * Verify the mlslabel string and convert to
 			 * internal hex label string.
 			 */
 
 			m_label_t *new_sl;
 			char *hex = NULL;	/* internal label string */
 
 			/* Default value is already OK. */
 			if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
 				break;
 
 			/* Verify the label can be converted to binary form */
 			if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
 			    (str_to_label(strval, &new_sl, MAC_LABEL,
 			    L_NO_CORRECTION, NULL) == -1)) {
 				goto badlabel;
 			}
 
 			/* Now translate to hex internal label string */
 			if (label_to_str(new_sl, &hex, M_INTERNAL,
 			    DEF_NAMES) != 0) {
 				if (hex)
 					free(hex);
 				goto badlabel;
 			}
 			m_label_free(new_sl);
 
 			/* If string is already in internal form, we're done. */
 			if (strcmp(strval, hex) == 0) {
 				free(hex);
 				break;
 			}
 
 			/* Replace the label string with the internal form. */
 			(void) nvlist_remove(ret, zfs_prop_to_name(prop),
 			    DATA_TYPE_STRING);
 			verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
 			    hex) == 0);
 			free(hex);
 
 			break;
 
 badlabel:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid mlslabel '%s'"), strval);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			m_label_free(new_sl);	/* OK if null */
 #else	/* !illumos */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "mlslabel is not supported on FreeBSD"));
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 #endif	/* illumos */
 			goto error;
 
 		}
 
 		case ZFS_PROP_MOUNTPOINT:
 		{
 			namecheck_err_t why;
 
 			if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
 			    strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
 				break;
 
 			if (mountpoint_namecheck(strval, &why)) {
 				switch (why) {
 				case NAME_ERR_LEADING_SLASH:
 					zfs_error_aux(hdl,
 					    dgettext(TEXT_DOMAIN,
 					    "'%s' must be an absolute path, "
 					    "'none', or 'legacy'"), propname);
 					break;
 				case NAME_ERR_TOOLONG:
 					zfs_error_aux(hdl,
 					    dgettext(TEXT_DOMAIN,
 					    "component of '%s' is too long"),
 					    propname);
 					break;
 				}
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 		}
 
 			/*FALLTHRU*/
 
 		case ZFS_PROP_SHARESMB:
 		case ZFS_PROP_SHARENFS:
 			/*
 			 * For the mountpoint and sharenfs or sharesmb
 			 * properties, check if it can be set in a
 			 * global/non-global zone based on
 			 * the zoned property value:
 			 *
 			 *		global zone	    non-global zone
 			 * --------------------------------------------------
 			 * zoned=on	mountpoint (no)	    mountpoint (yes)
 			 *		sharenfs (no)	    sharenfs (no)
 			 *		sharesmb (no)	    sharesmb (no)
 			 *
 			 * zoned=off	mountpoint (yes)	N/A
 			 *		sharenfs (yes)
 			 *		sharesmb (yes)
 			 */
 			if (zoned) {
 				if (getzoneid() == GLOBAL_ZONEID) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set on "
 					    "dataset in a non-global zone"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				} else if (prop == ZFS_PROP_SHARENFS ||
 				    prop == ZFS_PROP_SHARESMB) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set in "
 					    "a non-global zone"), propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				}
 			} else if (getzoneid() != GLOBAL_ZONEID) {
 				/*
 				 * If zoned property is 'off', this must be in
 				 * a global zone. If not, something is wrong.
 				 */
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' cannot be set while dataset "
 				    "'zoned' property is set"), propname);
 				(void) zfs_error(hdl, EZFS_ZONED, errbuf);
 				goto error;
 			}
 
 			/*
 			 * At this point, it is legitimate to set the
 			 * property. Now we want to make sure that the
 			 * property value is valid if it is sharenfs.
 			 */
 			if ((prop == ZFS_PROP_SHARENFS ||
 			    prop == ZFS_PROP_SHARESMB) &&
 			    strcmp(strval, "on") != 0 &&
 			    strcmp(strval, "off") != 0) {
 				zfs_share_proto_t proto;
 
 				if (prop == ZFS_PROP_SHARESMB)
 					proto = PROTO_SMB;
 				else
 					proto = PROTO_NFS;
 
 				/*
 				 * Must be an valid sharing protocol
 				 * option string so init the libshare
 				 * in order to enable the parser and
 				 * then parse the options. We use the
 				 * control API since we don't care about
 				 * the current configuration and don't
 				 * want the overhead of loading it
 				 * until we actually do something.
 				 */
 
 				if (zfs_init_libshare(hdl,
 				    SA_INIT_CONTROL_API) != SA_OK) {
 					/*
 					 * An error occurred so we can't do
 					 * anything
 					 */
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set: problem "
 					    "in share initialization"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (zfs_parse_options(strval, proto) != SA_OK) {
 					/*
 					 * There was an error in parsing so
 					 * deal with it by issuing an error
 					 * message and leaving after
 					 * uninitializing the the libshare
 					 * interface.
 					 */
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set to invalid "
 					    "options"), propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					zfs_uninit_libshare(hdl);
 					goto error;
 				}
 				zfs_uninit_libshare(hdl);
 			}
 
 			break;
 		case ZFS_PROP_UTF8ONLY:
 			chosen_utf = (int)intval;
 			break;
 		case ZFS_PROP_NORMALIZE:
 			chosen_normal = (int)intval;
 			break;
 		}
 
 		/*
 		 * For changes to existing volumes, we have some additional
 		 * checks to enforce.
 		 */
 		if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
 			uint64_t volsize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLSIZE);
 			uint64_t blocksize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLBLOCKSIZE);
 			char buf[64];
 
 			switch (prop) {
 			case ZFS_PROP_RESERVATION:
 			case ZFS_PROP_REFRESERVATION:
 				if (intval > volsize) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' is greater than current "
 					    "volume size"), propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 
 			case ZFS_PROP_VOLSIZE:
 				if (intval % blocksize != 0) {
 					zfs_nicenum(blocksize, buf,
 					    sizeof (buf));
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a multiple of "
 					    "volume block size (%s)"),
 					    propname, buf);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be zero"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If normalization was chosen, but no UTF8 choice was made,
 	 * enforce rejection of non-UTF8 names.
 	 *
 	 * If normalization was chosen, but rejecting non-UTF8 names
 	 * was explicitly not chosen, it is an error.
 	 */
 	if (chosen_normal > 0 && chosen_utf < 0) {
 		if (nvlist_add_uint64(ret,
 		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) {
 			(void) no_memory(hdl);
 			goto error;
 		}
 	} else if (chosen_normal > 0 && chosen_utf == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be set 'on' if normalization chosen"),
 		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		goto error;
 	}
 	return (ret);
 
 error:
 	nvlist_free(ret);
 	return (NULL);
 }
 
 /*
  * When 'volsize' is being changed on an existing volume whose reservation
  * currently tracks its size, add the matching new reservation to 'nvl'.
  *
  * Returns 1 if a reservation was added, 0 if nothing needed to be added
  * (the current reservation does not match the current volsize, or 'nvl'
  * already carries the reservation property), and -1 on error.
  */
 int
 zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 {
 	zfs_prop_t resv_prop;
 	nvlist_t *blk_props;
 	uint64_t cur_volsize, cur_resv;
 	uint64_t want_volsize, want_resv;
 	int rv = 0;
 
 	cur_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 	if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 		return (-1);
 	cur_resv = zfs_prop_get_int(zhp, resv_prop);
 
 	/* The reservation estimate is computed from the volume block size. */
 	blk_props = fnvlist_alloc();
 	fnvlist_add_uint64(blk_props,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
 
 	if (zvol_volsize_to_reservation(cur_volsize, blk_props) != cur_resv ||
 	    nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) {
 		/* Reservation is not tracking volsize, or caller set it. */
 		rv = 0;
 	} else if (nvlist_lookup_uint64(nvl,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &want_volsize) != 0) {
 		rv = -1;
 	} else {
 		want_resv = zvol_volsize_to_reservation(want_volsize,
 		    blk_props);
 		rv = 1;
 	}
 	fnvlist_free(blk_props);
 
 	if (rv != 1)
 		return (rv);
 
 	if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
 	    want_resv) != 0) {
 		(void) no_memory(zhp->zfs_hdl);
 		return (-1);
 	}
 	return (1);
 }
 
 /*
  * Translate an errno value returned while setting 'prop' into a libzfs
  * error on 'hdl', using 'errbuf' as the message prefix.  Some errno values
  * get a property-specific explanation; everything else is passed on to
  * zfs_standard_error().
  */
 void
 zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
     char *errbuf)
 {
 	switch (err) {
 
 	case ENOSPC:
 		/*
 		 * Setting a quota or reservation consumes no disk space,
 		 * so ENOSPC carries a different meaning for those
 		 * properties.
 		 */
 		if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "size is less than current used or "
 			    "reserved space"));
 			(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 		} else if (prop == ZFS_PROP_RESERVATION ||
 		    prop == ZFS_PROP_REFRESERVATION) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "size is greater than available space"));
 			(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
 		break;
 
 	case EBUSY:
 		(void) zfs_standard_error(hdl, EBUSY, errbuf);
 		break;
 
 	case EROFS:
 		(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
 		break;
 
 	case E2BIG:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "property value too long"));
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		break;
 
 	case ENOTSUP:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool and or dataset must be upgraded to set this "
 		    "property or value"));
 		(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 		break;
 
 	case ERANGE:
 	case EDOM:
 		switch (prop) {
 		case ZFS_PROP_COMPRESSION:
 		case ZFS_PROP_RECORDSIZE:
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
 			break;
 
 		case ZFS_PROP_CHECKSUM:
 		case ZFS_PROP_DEDUP:
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "root pools"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
 			break;
 
 		default:
 			(void) zfs_standard_error(hdl, err, errbuf);
 			break;
 		}
 		break;
 
 	case EINVAL:
 		/* An unrecognized property is reported as a bad property. */
 		if (prop != ZPROP_INVAL)
 			(void) zfs_standard_error(hdl, err, errbuf);
 		else
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		break;
 
 	case EOVERFLOW:
 		/*
 		 * This platform can't address a volume this big.
 		 */
 #ifdef _ILP32
 		if (prop == ZFS_PROP_VOLSIZE) {
 			(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
 			break;
 		}
 #endif
 		/* FALLTHROUGH */
 	default:
 		(void) zfs_standard_error(hdl, err, errbuf);
 	}
 }
 
 /*
  * Set a single property on the given dataset.  The name/value pair is
  * wrapped in a one-element nvlist and handed to zfs_prop_set_list().
  *
  * Returns the result of zfs_prop_set_list(), or -1 if the nvlist could
  * not be built.
  */
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *single = NULL;
 	char errbuf[1024];
 	int rv;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if (nvlist_alloc(&single, NV_UNIQUE_NAME, 0) == 0 &&
 	    nvlist_add_string(single, propname, propval) == 0) {
 		rv = zfs_prop_set_list(zhp, single);
 	} else {
 		(void) no_memory(hdl);
 		rv = -1;
 	}
 
 	nvlist_free(single);
 	return (rv);
 }
 
 
 
 /*
  * Given an nvlist of property names and values, set the properties for the
  * given dataset.
  *
  * The list is first validated with zfs_valid_proplist().  If 'volsize' is
  * among the properties, zfs_add_synthetic_resv() may add a matching
  * reservation.  A changelist is gathered and prefixed for each property
  * before ZFS_IOC_SET_PROP is issued, and postfixed once the ioctl succeeds.
  * If the ioctl fails with ENOSPC after a reservation was synthesized, the
  * previous volsize is written back.
  *
  * Returns 0 on success; every failure path leaves a non-zero value in the
  * return code (assuming zfs_error() reports non-zero -- TODO confirm).
  */
 int
 zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret = -1;
 	prop_changelist_t **cls = NULL;
 	int cl_idx;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *nvl;
 	int nvl_len;
 	/*
 	 * Must start at 0: it is only assigned when a volsize property is
 	 * present, but it is read unconditionally on the ioctl failure path.
 	 */
 	int added_resv = 0;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
 	    errbuf)) == NULL)
 		goto error;
 
 	/*
 	 * We have to check for any extra properties which need to be added
 	 * before computing the length of the nvlist.
 	 */
 	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem)) {
 		if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
 		    (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
 			goto error;
 		}
 	}
 	/*
 	 * Check how many properties we're setting and allocate an array to
 	 * store changelist pointers for postfix().
 	 */
 	nvl_len = 0;
 	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem))
 		nvl_len++;
 	if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
 		goto error;
 
 	cl_idx = 0;
 	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem)) {
 
 		zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
 
 		assert(cl_idx < nvl_len);
 		/*
 		 * We don't want to unmount & remount the dataset when changing
 		 * its canmount property to 'on' or 'noauto'.  We only use
 		 * the changelist logic to unmount when setting canmount=off.
 		 */
 		if (!(prop == ZFS_PROP_CANMOUNT &&
 		    fnvpair_value_uint64(elem) != ZFS_CANMOUNT_OFF)) {
 			cls[cl_idx] = changelist_gather(zhp, prop, 0, 0);
 			if (cls[cl_idx] == NULL)
 				goto error;
 		}
 
 		if (prop == ZFS_PROP_MOUNTPOINT &&
 		    changelist_haszonedchild(cls[cl_idx])) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "child dataset with inherited mountpoint is used "
 			    "in a non-global zone"));
 			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 			goto error;
 		}
 
 		/* We don't support those properties on FreeBSD. */
 		switch (prop) {
 		case ZFS_PROP_DEVICES:
 		case ZFS_PROP_ISCSIOPTIONS:
 		case ZFS_PROP_XATTR:
 		case ZFS_PROP_VSCAN:
 		case ZFS_PROP_NBMAND:
 		case ZFS_PROP_MLSLABEL:
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    "property '%s' not supported on FreeBSD",
 			    nvpair_name(elem));
 			ret = zfs_error(hdl, EZFS_PERM, errbuf);
 			goto error;
 		}
 
 		if (cls[cl_idx] != NULL &&
 		    (ret = changelist_prefix(cls[cl_idx])) != 0)
 			goto error;
 
 		cl_idx++;
 	}
 	assert(cl_idx == nvl_len);
 
 	/*
 	 * Execute the corresponding ioctl() to set this list of properties.
 	 */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 ||
 	    (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0)
 		goto error;
 
 	ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 
 	if (ret != 0) {
 		/*
 		 * Capture errno immediately: the calls below may overwrite
 		 * it before it is reported and before the ENOSPC check.
 		 */
 		int ioc_errno = errno;
 
 		/* Get the list of unset properties back and report them. */
 		nvlist_t *errorprops = NULL;
 		if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
 			goto error;
 		for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
 		    elem != NULL;
 		    elem = nvlist_next_nvpair(nvl, elem)) {
 			zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
 			zfs_setprop_error(hdl, prop, ioc_errno, errbuf);
 		}
 		nvlist_free(errorprops);
 
 		if (added_resv && ioc_errno == ENOSPC) {
 			/* clean up the volsize property we tried to set */
 			uint64_t old_volsize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLSIZE);
 			nvlist_free(nvl);
 			nvl = NULL;
 			zcmd_free_nvlists(&zc);
 
 			if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
 				goto error;
 			if (nvlist_add_uint64(nvl,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    old_volsize) != 0)
 				goto error;
 			if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
 				goto error;
 			(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 		}
 	} else {
 		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
 			if (cls[cl_idx] != NULL) {
 				int clp_err = changelist_postfix(cls[cl_idx]);
 				if (clp_err != 0)
 					ret = clp_err;
 			}
 		}
 
 		/*
 		 * Refresh the statistics so the new property value
 		 * is reflected.
 		 */
 		if (ret == 0)
 			(void) get_stats(zhp);
 	}
 
 error:
 	nvlist_free(nvl);
 	zcmd_free_nvlists(&zc);
 	if (cls != NULL) {
 		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
 			if (cls[cl_idx] != NULL)
 				changelist_free(cls[cl_idx]);
 		}
 		free(cls);
 	}
 	return (ret);
 }
 
 /*
  * Given a property, inherit the value from the parent dataset, or if received
  * is TRUE, revert to the received value, if any.
  *
  * User properties are handled with a direct ZFS_IOC_INHERIT_PROP call.
  * For native properties the request is rejected if the property is
  * read-only, is not inheritable (unless 'received' is set), or does not
  * apply to this dataset type.  Otherwise a changelist is gathered and
  * prefixed before the ioctl and postfixed after it succeeds.
  *
  * Returns 0 on success, -1 if the changelist could not be gathered, or the
  * value produced by zfs_error() / zfs_standard_error() / the changelist
  * routines on other failures.
  */
 int
 zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	prop_changelist_t *cl;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
 
 	zc.zc_cookie = received;
 	if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
 		/*
 		 * For user properties, the amount of work we have to do is very
 		 * small, so just do it here.
 		 */
 		if (!zfs_prop_user(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 		if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0)
 			return (zfs_standard_error(hdl, errno, errbuf));
 
 		return (0);
 	}
 
 	/*
 	 * Verify that this property is inheritable.
 	 */
 	if (zfs_prop_readonly(prop))
 		return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
 
 	if (!zfs_prop_inheritable(prop) && !received)
 		return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
 
 	/*
 	 * Check to see if the value applies to this type
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
 	/*
 	 * Normalize the name, to get rid of shorthand abbreviations.
 	 */
 	propname = zfs_prop_to_name(prop);
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 	if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Determine datasets which will be affected by this change, if any.
 	 */
 	if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
 		return (-1);
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) {
 		/*
 		 * Report the failure while errno is still fresh, then leave
 		 * through the common exit so the changelist is released
 		 * instead of leaked.
 		 */
 		ret = zfs_standard_error(hdl, errno, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_postfix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Refresh the statistics so the new property is reflected.
 	 */
 	(void) get_stats(zhp);
 
 error:
 	changelist_free(cl);
 	return (ret);
 }
 
 /*
  * True DSL properties are stored in an nvlist.  The following two functions
  * extract them appropriately.
  *
  * getprop_uint64() returns the numeric value recorded for 'prop' and stores
  * the recorded source string (if any) in '*source'.  When the property has
  * no entry, the numeric default is returned and '*source' is set to "".
  */
 static uint64_t
 getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *entry;
 	uint64_t result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props,
 	    zfs_prop_to_name(prop), &entry) != 0) {
 		/* No entry for this property: fall back to the default. */
 		verify(!zhp->zfs_props_table ||
 		    zhp->zfs_props_table[prop] == B_TRUE);
 		result = zfs_prop_default_numeric(prop);
 		*source = "";
 		return (result);
 	}
 
 	verify(nvlist_lookup_uint64(entry, ZPROP_VALUE, &result) == 0);
 	(void) nvlist_lookup_string(entry, ZPROP_SOURCE, source);
 	return (result);
 }
 
 /*
  * String counterpart of getprop_uint64(): returns the string value recorded
  * for 'prop' and stores the recorded source string (if any) in '*source'.
  * When the property has no entry, the string default is returned and
  * '*source' is set to "".
  */
 static const char *
 getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *entry;
 	const char *result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props,
 	    zfs_prop_to_name(prop), &entry) != 0) {
 		/* No entry for this property: fall back to the default. */
 		verify(!zhp->zfs_props_table ||
 		    zhp->zfs_props_table[prop] == B_TRUE);
 		result = zfs_prop_default_string(prop);
 		*source = "";
 		return (result);
 	}
 
 	result = fnvlist_lookup_string(entry, ZPROP_VALUE);
 	(void) nvlist_lookup_string(entry, ZPROP_SOURCE, source);
 	return (result);
 }
 
 /*
  * Report whether the handle's active property list is currently its
  * received-properties list.
  */
 static boolean_t
 zfs_is_recvd_props_mode(zfs_handle_t *zhp)
 {
 	nvlist_t *active = zhp->zfs_props;
 
 	return (active == zhp->zfs_recvd_props);
 }
 
 /*
  * Point the handle's active property list at its received properties,
  * stashing the previous list pointer in '*cookie' so that
  * zfs_unset_recvd_props_mode() can restore it.
  */
 static void
 zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
 {
 	nvlist_t *previous = zhp->zfs_props;
 
 	*cookie = (uint64_t)(uintptr_t)previous;
 	zhp->zfs_props = zhp->zfs_recvd_props;
 }
 
 /*
  * Restore the property list pointer that zfs_set_recvd_props_mode() saved
  * in '*cookie', and clear the cookie.
  */
 static void
 zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
 {
 	nvlist_t *previous = (nvlist_t *)(uintptr_t)*cookie;
 
 	zhp->zfs_props = previous;
 	*cookie = 0;
 }
 
 /*
  * Internal function for getting a numeric property.  Both zfs_prop_get() and
  * zfs_prop_get_int() are built using this interface.
  *
  * Certain properties can be overridden using 'mount -o'.  In this case, scan
  * the contents of the /etc/mnttab entry, searching for the appropriate options.
  * If they differ from the on-disk values, report the current values and mark
  * the source "temporary".
  *
  * On success the value is stored in '*val' and 0 is returned; '*source' is
  * set to the recorded source string, or left NULL when none applies.
  * Returns -1 when the mount options cannot be duplicated or the ZPL
  * properties cannot be fetched, and the result of zfs_error() when 'prop'
  * is not a numeric or index property.
  */
 static int
 get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
     char **source, uint64_t *val)
 {
 	zfs_cmd_t zc = { 0 };
 	nvlist_t *zplprops = NULL;
 	struct mnttab mnt;
 	char *mntopt_on = NULL;
 	char *mntopt_off = NULL;
 	boolean_t received = zfs_is_recvd_props_mode(zhp);
 
 	*source = NULL;
 
 	/* Pick the mount option pair that can override this property. */
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 		mntopt_on = MNTOPT_ATIME;
 		mntopt_off = MNTOPT_NOATIME;
 		break;
 
 	case ZFS_PROP_DEVICES:
 		mntopt_on = MNTOPT_DEVICES;
 		mntopt_off = MNTOPT_NODEVICES;
 		break;
 
 	case ZFS_PROP_EXEC:
 		mntopt_on = MNTOPT_EXEC;
 		mntopt_off = MNTOPT_NOEXEC;
 		break;
 
 	case ZFS_PROP_READONLY:
 		mntopt_on = MNTOPT_RO;
 		mntopt_off = MNTOPT_RW;
 		break;
 
 	case ZFS_PROP_SETUID:
 		mntopt_on = MNTOPT_SETUID;
 		mntopt_off = MNTOPT_NOSETUID;
 		break;
 
 	case ZFS_PROP_XATTR:
 		mntopt_on = MNTOPT_XATTR;
 		mntopt_off = MNTOPT_NOXATTR;
 		break;
 
 	case ZFS_PROP_NBMAND:
 		mntopt_on = MNTOPT_NBMAND;
 		mntopt_off = MNTOPT_NONBMAND;
 		break;
 	}
 
 	/*
 	 * Because looking up the mount options is potentially expensive
 	 * (iterating over all of /etc/mnttab), we defer its calculation until
 	 * we're looking up a property which requires its presence.
 	 */
 	if (!zhp->zfs_mntcheck &&
 	    (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
 		libzfs_handle_t *hdl = zhp->zfs_hdl;
 		struct mnttab entry;
 
 		if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
 			zhp->zfs_mntopts = zfs_strdup(hdl,
 			    entry.mnt_mntopts);
 			if (zhp->zfs_mntopts == NULL)
 				return (-1);
 		}
 
 		zhp->zfs_mntcheck = B_TRUE;
 	}
 
 	/* Only mnt_mntopts is filled in; hasmntopt() is handed just that. */
 	if (zhp->zfs_mntopts == NULL)
 		mnt.mnt_mntopts = "";
 	else
 		mnt.mnt_mntopts = zhp->zfs_mntopts;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 	case ZFS_PROP_XATTR:
 	case ZFS_PROP_NBMAND:
 		*val = getprop_uint64(zhp, prop, source);
 
 		/* Received values are reported without mount overrides. */
 		if (received)
 			break;
 
 		if (hasmntopt(&mnt, mntopt_on) && !*val) {
 			*val = B_TRUE;
 			if (src)
 				*src = ZPROP_SRC_TEMPORARY;
 		} else if (hasmntopt(&mnt, mntopt_off) && *val) {
 			*val = B_FALSE;
 			if (src)
 				*src = ZPROP_SRC_TEMPORARY;
 		}
 		break;
 
 	case ZFS_PROP_CANMOUNT:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFRESERVATION:
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 	case ZFS_PROP_FILESYSTEM_COUNT:
 	case ZFS_PROP_SNAPSHOT_COUNT:
 		*val = getprop_uint64(zhp, prop, source);
 
 		if (*source == NULL) {
 			/* not default, must be local */
 			*source = zhp->zfs_name;
 		}
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		*val = (zhp->zfs_mntopts != NULL);
 		break;
 
 	case ZFS_PROP_NUMCLONES:
 		*val = zhp->zfs_dmustats.dds_num_clones;
 		break;
 
 	case ZFS_PROP_VERSION:
 	case ZFS_PROP_NORMALIZE:
 	case ZFS_PROP_UTF8ONLY:
 	case ZFS_PROP_CASE:
 		if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) ||
 		    zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
 			return (-1);
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 		if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
 		    nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
 		    val) != 0) {
 			/*
 			 * If only the lookup failed, zplprops holds an
 			 * allocated list that must be released.
 			 * NOTE(review): assumes zplprops stays NULL when the
 			 * read itself fails and that nvlist_free(NULL) is a
 			 * no-op, as other callers in this file rely on.
 			 */
 			nvlist_free(zplprops);
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 		nvlist_free(zplprops);
 		zcmd_free_nvlists(&zc);
 		break;
 
 	case ZFS_PROP_INCONSISTENT:
 		*val = zhp->zfs_dmustats.dds_inconsistent;
 		break;
 
 	default:
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
 		case PROP_TYPE_INDEX:
 			*val = getprop_uint64(zhp, prop, source);
 			/*
 			 * If we tried to use a default value for a
 			 * readonly property, it means that it was not
 			 * present.
 			 */
 			if (zfs_prop_readonly(prop) &&
 			    *source != NULL && (*source)[0] == '\0') {
 				*source = NULL;
 			}
 			break;
 
 		case PROP_TYPE_STRING:
 		default:
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "cannot get non-numeric property"));
 			return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "internal error")));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Calculate the source type, given the raw source string.
  *
  * Nothing is changed when the caller supplied no 'statbuf' or when
  * '*srctype' is already ZPROP_SRC_TEMPORARY.  For an inherited value the
  * source string is also copied into 'statbuf'.
  */
 static void
 get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
     char *statbuf, size_t statlen)
 {
 	if (statbuf == NULL || *srctype == ZPROP_SRC_TEMPORARY)
 		return;
 
 	if (source == NULL) {
 		*srctype = ZPROP_SRC_NONE;
 		return;
 	}
 
 	if (source[0] == '\0') {
 		*srctype = ZPROP_SRC_DEFAULT;
 		return;
 	}
 
 	if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
 		*srctype = ZPROP_SRC_RECEIVED;
 		return;
 	}
 
 	if (strcmp(source, zhp->zfs_name) == 0) {
 		/* Set on this very dataset. */
 		*srctype = ZPROP_SRC_LOCAL;
 		return;
 	}
 
 	/* Any other source string names the dataset we inherit from. */
 	(void) strlcpy(statbuf, source, statlen);
 	*srctype = ZPROP_SRC_INHERITED;
 }
 
 /*
  * Fetch the received value of the named property into 'propbuf'.
  *
  * The received-properties list is loaded on first use.  Native properties
  * are formatted by zfs_prop_get() while the handle is temporarily switched
  * to its received-properties list; any other name is looked up directly
  * and its string value copied out.
  *
  * Returns 0 on success and -1 if the received properties cannot be
  * loaded, the property has no received value, or zfs_prop_get() fails.
  */
 int
 zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
     size_t proplen, boolean_t literal)
 {
 	zfs_prop_t prop;
 	uint64_t cookie;
 	int rv;
 
 	if (zhp->zfs_recvd_props == NULL &&
 	    get_recvd_props_ioctl(zhp) != 0)
 		return (-1);
 
 	prop = zfs_name_to_prop(propname);
 
 	if (prop == ZPROP_INVAL) {
 		nvlist_t *entry;
 		char *text;
 
 		if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
 		    propname, &entry) != 0)
 			return (-1);
 		verify(nvlist_lookup_string(entry, ZPROP_VALUE,
 		    &text) == 0);
 		(void) strlcpy(propbuf, text, proplen);
 		return (0);
 	}
 
 	if (!nvlist_exists(zhp->zfs_recvd_props, propname))
 		return (-1);
 
 	zfs_set_recvd_props_mode(zhp, &cookie);
 	rv = zfs_prop_get(zhp, prop, propbuf, proplen,
 	    NULL, NULL, 0, literal);
 	zfs_unset_recvd_props_mode(zhp, &cookie);
 
 	return (rv != 0 ? -1 : 0);
 }
 
 /*
  * Write the names in the dataset's clones list into 'propbuf' as a
  * comma-separated string.  Returns 0 on success, or -1 if the clones list
  * cannot be obtained.
  */
 static int
 get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
 {
 	nvlist_t *clones = zfs_get_clones_nvl(zhp);
 	nvpair_t *cur = NULL;
 
 	if (clones == NULL)
 		return (-1);
 
 	propbuf[0] = '\0';
 	while ((cur = nvlist_next_nvpair(clones, cur)) != NULL) {
 		/* Separator goes before every name except the first. */
 		if (propbuf[0] != '\0')
 			(void) strlcat(propbuf, ",", proplen);
 		(void) strlcat(propbuf, nvpair_name(cur), proplen);
 	}
 
 	return (0);
 }
 
 /*
  * State shared across the get_clones_cb() walk that collects the clones of
  * one snapshot.
  */
 struct get_clones_arg {
 	uint64_t numclones;	/* clones still to find; walk stops at 0 */
 	nvlist_t *value;	/* receives the name of each clone found */
 	const char *origin;	/* name each dataset's origin is compared to */
 	char buf[ZFS_MAXNAMELEN];	/* scratch for the origin property */
 };
 
 /*
  * Iteration callback used to collect clones.  If the dataset's origin
  * matches gca->origin, its name is added to gca->value and the outstanding
  * count is decremented; the dataset's children are then visited
  * recursively.  Once no clones remain to be found the dataset is simply
  * closed.  The handle is always closed and 0 is always returned.
  */
 int
 get_clones_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct get_clones_arg *gca = arg;
 
 	if (gca->numclones != 0) {
 		if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf,
 		    sizeof (gca->buf), NULL, NULL, 0, B_TRUE) == 0 &&
 		    strcmp(gca->buf, gca->origin) == 0) {
 			fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
 			gca->numclones--;
 		}
 
 		/* Descend even if this dataset's origin was unreadable. */
 		(void) zfs_iter_children(zhp, get_clones_cb, gca);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Return the nvlist holding the names of the clones of the given dataset,
  * or NULL on failure.
  *
  * If the handle's properties already contain a 'clones' entry, its value
  * list is returned.  Otherwise, for a snapshot only, the list is built by
  * walking every dataset in the pool looking for ones whose origin is this
  * snapshot, and the result is stored in zhp->zfs_props.  The returned list
  * is looked up from zhp->zfs_props rather than being a fresh allocation.
  */
 nvlist_t *
 zfs_get_clones_nvl(zfs_handle_t *zhp)
 {
 	nvlist_t *nv, *value;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props,
 	    zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) {
 		struct get_clones_arg gca;
 
 		/*
 		 * if this is a snapshot, then the kernel wasn't able
 		 * to get the clones.  Do it by slowly iterating.
 		 */
 		if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
 			return (NULL);
 		if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
 			return (NULL);
 		if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
 			nvlist_free(nv);
 			return (NULL);
 		}
 
 		gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
 		gca.value = value;
 		gca.origin = zhp->zfs_name;
 
 		if (gca.numclones != 0) {
 			zfs_handle_t *root;
 			char pool[ZFS_MAXNAMELEN];
 			char *cp = pool;
 
 			/* get the pool name */
 			(void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
 			(void) strsep(&cp, "/@");
 			root = zfs_open(zhp->zfs_hdl, pool,
 			    ZFS_TYPE_FILESYSTEM);
 			if (root == NULL) {
 				/*
 				 * The pool's root dataset could not be
 				 * opened; the callback would dereference
 				 * the NULL handle, so give up here.
 				 */
 				nvlist_free(nv);
 				nvlist_free(value);
 				return (NULL);
 			}
 
 			/* get_clones_cb() closes 'root' for us. */
 			(void) get_clones_cb(root, &gca);
 		}
 
 		/* A non-zero count means some clones were never located. */
 		if (gca.numclones != 0 ||
 		    nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
 		    nvlist_add_nvlist(zhp->zfs_props,
 		    zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) {
 			nvlist_free(nv);
 			nvlist_free(value);
 			return (NULL);
 		}
 		/*
 		 * The local lists are freed and the entry is looked up again
 		 * from zhp->zfs_props, so what is returned is the stored copy.
 		 */
 		nvlist_free(nv);
 		nvlist_free(value);
 		verify(0 == nvlist_lookup_nvlist(zhp->zfs_props,
 		    zfs_prop_to_name(ZFS_PROP_CLONES), &nv));
 	}
 
 	verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0);
 
 	return (value);
 }
 
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
  * human-readable form.
  *
  * Returns 0 on success, or -1 on error.
  */
 int
 zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
     zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
 {
 	char *source = NULL;
 	uint64_t val;
 	const char *str;
 	const char *strval;
 	boolean_t received = zfs_is_recvd_props_mode(zhp);
 
 	/*
 	 * Check to see if this property applies to our object
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (-1);
 
 	if (received && zfs_prop_readonly(prop))
 		return (-1);
 
 	if (src)
 		*src = ZPROP_SRC_NONE;
 
 	switch (prop) {
 	case ZFS_PROP_CREATION:
 		/*
 		 * 'creation' is a time_t stored in the statistics.  We convert
 		 * this into a string unless 'literal' is specified.
 		 */
 		{
 			val = getprop_uint64(zhp, prop, &source);
 			time_t time = (time_t)val;
 			struct tm t;
 
 			if (literal ||
 			    localtime_r(&time, &t) == NULL ||
 			    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
 			    &t) == 0)
 				(void) snprintf(propbuf, proplen, "%llu", val);
 		}
 		break;
 
 	case ZFS_PROP_MOUNTPOINT:
 		/*
 		 * Getting the precise mountpoint can be tricky.
 		 *
 		 *  - for 'none' or 'legacy', return those values.
 		 *  - for inherited mountpoints, we want to take everything
 		 *    after our ancestor and append it to the inherited value.
 		 *
 		 * If the pool has an alternate root, we want to prepend that
 		 * root to any values we return.
 		 */
 
 		str = getprop_string(zhp, prop, &source);
 
 		if (str[0] == '/') {
 			char buf[MAXPATHLEN];
 			char *root = buf;
 			const char *relpath;
 
 			/*
 			 * If we inherit the mountpoint, even from a dataset
 			 * with a received value, the source will be the path of
 			 * the dataset we inherit from. If source is
 			 * ZPROP_SOURCE_VAL_RECVD, the received value is not
 			 * inherited.
 			 */
 			if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
 				relpath = "";
 			} else {
 				relpath = zhp->zfs_name + strlen(source);
 				if (relpath[0] == '/')
 					relpath++;
 			}
 
 			if ((zpool_get_prop(zhp->zpool_hdl,
 			    ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
 			    B_FALSE)) || (strcmp(root, "-") == 0))
 				root[0] = '\0';
 			/*
 			 * Special case an alternate root of '/'. This will
 			 * avoid having multiple leading slashes in the
 			 * mountpoint path.
 			 */
 			if (strcmp(root, "/") == 0)
 				root++;
 
 			/*
 			 * If the mountpoint is '/' then skip over this
 			 * if we are obtaining either an alternate root or
 			 * an inherited mountpoint.
 			 */
 			if (str[1] == '\0' && (root[0] != '\0' ||
 			    relpath[0] != '\0'))
 				str++;
 
 			if (relpath[0] == '\0')
 				(void) snprintf(propbuf, proplen, "%s%s",
 				    root, str);
 			else
 				(void) snprintf(propbuf, proplen, "%s%s%s%s",
 				    root, str, relpath[0] == '@' ? "" : "/",
 				    relpath);
 		} else {
 			/* 'legacy' or 'none' */
 			(void) strlcpy(propbuf, str, proplen);
 		}
 
 		break;
 
 	case ZFS_PROP_ORIGIN:
 		str = getprop_string(zhp, prop, &source);
 		if (str == NULL)
 			return (-1);
 		(void) strlcpy(propbuf, str, proplen);
 		break;
 
 	case ZFS_PROP_CLONES:
 		if (get_clones_string(zhp, propbuf, proplen) != 0)
 			return (-1);
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFRESERVATION:
 
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
 		/*
 		 * If quota or reservation is 0, we translate this into 'none'
 		 * (unless literal is set), and indicate that it's the default
 		 * value.  Otherwise, we print the number nicely and indicate
 		 * that its set locally.
 		 */
 		if (val == 0) {
 			if (literal)
 				(void) strlcpy(propbuf, "0", proplen);
 			else
 				(void) strlcpy(propbuf, "none", proplen);
 		} else {
 			if (literal)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			else
 				zfs_nicenum(val, propbuf, proplen);
 		}
 		break;
 
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 	case ZFS_PROP_FILESYSTEM_COUNT:
 	case ZFS_PROP_SNAPSHOT_COUNT:
 
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
 		/*
 		 * If limit is UINT64_MAX, we translate this into 'none' (unless
 		 * literal is set), and indicate that it's the default value.
 		 * Otherwise, we print the number nicely and indicate that it's
 		 * set locally.
 		 */
 		if (literal) {
 			(void) snprintf(propbuf, proplen, "%llu",
 			    (u_longlong_t)val);
 		} else if (val == UINT64_MAX) {
 			(void) strlcpy(propbuf, "none", proplen);
 		} else {
 			zfs_nicenum(val, propbuf, proplen);
 		}
 		break;
 
 	case ZFS_PROP_REFRATIO:
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		(void) snprintf(propbuf, proplen, "%llu.%02llux",
 		    (u_longlong_t)(val / 100),
 		    (u_longlong_t)(val % 100));
 		break;
 
 	case ZFS_PROP_TYPE:
 		switch (zhp->zfs_type) {
 		case ZFS_TYPE_FILESYSTEM:
 			str = "filesystem";
 			break;
 		case ZFS_TYPE_VOLUME:
 			str = "volume";
 			break;
 		case ZFS_TYPE_SNAPSHOT:
 			str = "snapshot";
 			break;
 		case ZFS_TYPE_BOOKMARK:
 			str = "bookmark";
 			break;
 		default:
 			abort();
 		}
 		(void) snprintf(propbuf, proplen, "%s", str);
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/*
 		 * The 'mounted' property is a pseudo-property that describes
 		 * whether the filesystem is currently mounted.  Even though
 		 * it's a boolean value, the typical values of "on" and "off"
 		 * don't make sense, so we translate to "yes" and "no".
 		 */
 		if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
 		    src, &source, &val) != 0)
 			return (-1);
 		if (val)
 			(void) strlcpy(propbuf, "yes", proplen);
 		else
 			(void) strlcpy(propbuf, "no", proplen);
 		break;
 
 	case ZFS_PROP_NAME:
 		/*
 		 * The 'name' property is a pseudo-property derived from the
 		 * dataset name.  It is presented as a real property to simplify
 		 * consumers.
 		 */
 		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
 		break;
 
 	case ZFS_PROP_MLSLABEL:
 		{
 #ifdef illumos
 			m_label_t *new_sl = NULL;
 			char *ascii = NULL;	/* human readable label */
 
 			(void) strlcpy(propbuf,
 			    getprop_string(zhp, prop, &source), proplen);
 
 			if (literal || (strcasecmp(propbuf,
 			    ZFS_MLSLABEL_DEFAULT) == 0))
 				break;
 
 			/*
 			 * Try to translate the internal hex string to
 			 * human-readable output.  If there are any
 			 * problems just use the hex string.
 			 */
 
 			if (str_to_label(propbuf, &new_sl, MAC_LABEL,
 			    L_NO_CORRECTION, NULL) == -1) {
 				m_label_free(new_sl);
 				break;
 			}
 
 			if (label_to_str(new_sl, &ascii, M_LABEL,
 			    DEF_NAMES) != 0) {
 				if (ascii)
 					free(ascii);
 				m_label_free(new_sl);
 				break;
 			}
 			m_label_free(new_sl);
 
 			(void) strlcpy(propbuf, ascii, proplen);
 			free(ascii);
 #else	/* !illumos */
 			propbuf[0] = '\0';
 #endif	/* illumos */
 		}
 		break;
 
 	case ZFS_PROP_GUID:
 		/*
 		 * GUIDs are stored as numbers, but they are identifiers.
 		 * We don't want them to be pretty printed, because pretty
 		 * printing mangles the ID into a truncated and useless value.
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
 		break;
 
 	default:
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
 			if (get_numeric_property(zhp, prop, src,
 			    &source, &val) != 0)
 				return (-1);
 			if (literal)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			else
 				zfs_nicenum(val, propbuf, proplen);
 			break;
 
 		case PROP_TYPE_STRING:
 			str = getprop_string(zhp, prop, &source);
 			if (str == NULL)
 				return (-1);
 			(void) strlcpy(propbuf, str, proplen);
 			break;
 
 		case PROP_TYPE_INDEX:
 			if (get_numeric_property(zhp, prop, src,
 			    &source, &val) != 0)
 				return (-1);
 			if (zfs_prop_index_to_string(prop, val, &strval) != 0)
 				return (-1);
 			(void) strlcpy(propbuf, strval, proplen);
 			break;
 
 		default:
 			abort();
 		}
 	}
 
 	get_source(zhp, src, source, statbuf, statlen);
 
 	return (0);
 }
 
 /*
  * Utility function to get the given numeric property.  Does no validation that
  * the given property is the appropriate type; should only be used with
  * hard-coded property types.
  *
  * The status returned by get_numeric_property() is deliberately discarded, so
  * the result is pre-set to 0: a failed lookup that leaves the output untouched
  * then yields 0 instead of an indeterminate value.
  */
 uint64_t
 zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
 {
 	char *source;
 	uint64_t val = 0;
 
 	(void) get_numeric_property(zhp, prop, NULL, &source, &val);
 
 	return (val);
 }
 
 /*
  * Sets the given property to a numeric value.  The value is rendered as an
  * unsigned decimal string and handed to zfs_prop_set(), whose return value is
  * passed back to the caller.
  */
 int
 zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val)
 {
 	char buf[64];
 
 	/* %llu takes an unsigned argument, so cast to u_longlong_t. */
 	(void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)val);
 	return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf));
 }
 
 /*
  * Numeric counterpart of zfs_prop_get(): the property's value is stored in
  * '*value' rather than being formatted into a string.  If the property is not
  * valid for this handle's dataset type, EZFS_PROPTYPE is reported through
  * zfs_error_fmt(); if the value cannot be fetched, -1 is returned.  On success
  * the source information is filled in by get_source() and 0 is returned.
  */
 int
 zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
     zprop_source_t *src, char *statbuf, size_t statlen)
 {
 	char *origin;
 
 	/* Reject properties that do not apply to this kind of dataset. */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
 		return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
 		    dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
 		    zfs_prop_to_name(prop)));
 	}
 
 	if (src != NULL)
 		*src = ZPROP_SRC_NONE;
 
 	if (get_numeric_property(zhp, prop, src, &origin, value) == 0) {
 		get_source(zhp, src, origin, statbuf, statlen);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Ask idmap for the SID domain and RID that correspond to a uid (isuser) or
  * gid (!isuser), storing them in '*domainp' and '*ridp'.  Returns 0 when the
  * lookup request, the mapping run and the per-request status all report
  * success, and EINVAL otherwise.  Only implemented on illumos; elsewhere the
  * code path is asserted to be unreachable.
  */
 static int
 idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
     char **domainp, idmap_rid_t *ridp)
 {
 #ifdef illumos
 	idmap_get_handle_t *handle = NULL;
 	idmap_stat status;
 	int rv;
 
 	if (idmap_get_create(&handle) != IDMAP_SUCCESS) {
 		if (handle != NULL)
 			idmap_get_destroy(handle);
 		return (EINVAL);
 	}
 
 	if (!isuser) {
 		rv = idmap_get_sidbygid(handle, id,
 		    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
 	} else {
 		rv = idmap_get_sidbyuid(handle, id,
 		    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
 	}
 
 	/* The mappings are only requested if queuing the lookup worked. */
 	if (rv != IDMAP_SUCCESS ||
 	    idmap_get_mappings(handle) != IDMAP_SUCCESS ||
 	    status != IDMAP_SUCCESS)
 		rv = EINVAL;
 	else
 		rv = 0;
 
 	if (handle != NULL)
 		idmap_get_destroy(handle);
 	return (rv);
 #else	/* !illumos */
 	assert(!"invalid code path");
 	return (EINVAL); // silence compiler warning
 #endif	/* illumos */
 }
 
 /*
  * convert the propname into parameters needed by kernel
  * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
  * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789
  *
  * On success returns 0 with '*typep' set to the matched userquota property,
  * 'domain' holding the SID domain ("" when none applies) and '*ridp' holding
  * the uid/gid or RID.  Returns EINVAL for an unrecognized prefix or a
  * malformed number, ENOENT when a name cannot (or may not) be resolved, and
  * ENOMEM when no idmap handle can be created.  '*typep' is left untouched when
  * the prefix is not recognized.
  */
 static int
 userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
 {
 	zfs_userquota_prop_t type;
 	char *cp, *end;
 	char *numericsid = NULL;
 	boolean_t isuser;
 
 	domain[0] = '\0';
 	*ridp = 0;
 	/* Figure out the property type ({user|group}{quota|space}) */
 	for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
 		if (strncmp(propname, zfs_userquota_prop_prefixes[type],
 		    strlen(zfs_userquota_prop_prefixes[type])) == 0)
 			break;
 	}
 	if (type == ZFS_NUM_USERQUOTA_PROPS)
 		return (EINVAL);
 	*typep = type;
 
 	isuser = (type == ZFS_PROP_USERQUOTA ||
 	    type == ZFS_PROP_USERUSED);
 
 	/*
 	 * Step past the first '@'.  NOTE(review): this presumes every entry
 	 * of zfs_userquota_prop_prefixes contains an '@', so strchr() cannot
 	 * return NULL after a successful prefix match -- confirm.
 	 */
 	cp = strchr(propname, '@') + 1;
 
 	if (strchr(cp, '@')) {
 #ifdef illumos
 		/*
 		 * It's a SID name (eg "user@domain") that needs to be
 		 * turned into S-1-domainID-RID.
 		 */
 		int flag = 0;
 		idmap_stat stat, map_stat;
 		uid_t pid;
 		idmap_rid_t rid;
 		idmap_get_handle_t *gh = NULL;
 
 		stat = idmap_get_create(&gh);
 		if (stat != IDMAP_SUCCESS) {
 			idmap_get_destroy(gh);
 			return (ENOMEM);
 		}
 		/* every exit below must release the idmap handle */
 		if (zoned && getzoneid() == GLOBAL_ZONEID) {
 			idmap_get_destroy(gh);
 			return (ENOENT);
 		}
 		if (isuser) {
 			stat = idmap_getuidbywinname(cp, NULL, flag, &pid);
 			if (stat < 0) {
 				idmap_get_destroy(gh);
 				return (ENOENT);
 			}
 			stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid,
 			    &rid, &map_stat);
 		} else {
 			stat = idmap_getgidbywinname(cp, NULL, flag, &pid);
 			if (stat < 0) {
 				idmap_get_destroy(gh);
 				return (ENOENT);
 			}
 			stat = idmap_get_sidbygid(gh, pid, flag, &numericsid,
 			    &rid, &map_stat);
 		}
 		if (stat < 0) {
 			idmap_get_destroy(gh);
 			return (ENOENT);
 		}
 		stat = idmap_get_mappings(gh);
 		idmap_get_destroy(gh);
 
 		if (stat < 0) {
 			/* free(NULL) is a no-op if nothing was returned */
 			free(numericsid);
 			return (ENOENT);
 		}
 		if (numericsid == NULL)
 			return (ENOENT);
 		cp = numericsid;
 		*ridp = rid;
 		/* will be further decoded below */
 #else	/* !illumos */
 		return (ENOENT);
 #endif	/* illumos */
 	}
 
 	if (strncmp(cp, "S-1-", 4) == 0) {
 		/* It's a numeric SID (eg "S-1-234-567-89") */
 		(void) strlcpy(domain, cp, domainlen);
 		errno = 0;
 		if (*ridp == 0) {
 			/* split "domain-rid" at the last dash */
 			cp = strrchr(domain, '-');
 			*cp = '\0';
 			cp++;
 			*ridp = strtoull(cp, &end, 10);
 		} else {
 			/* the RID was already supplied by idmap above */
 			end = "";
 		}
 		if (numericsid) {
 			free(numericsid);
 			numericsid = NULL;
 		}
 		if (errno != 0 || *end != '\0')
 			return (EINVAL);
 	} else if (!isdigit((unsigned char)*cp)) {
 		/*
 		 * It's a user/group name (eg "user") that needs to be
 		 * turned into a uid/gid
 		 *
 		 * (The cast above keeps isdigit() defined for bytes that are
 		 * negative when 'char' is signed.)
 		 */
 		if (zoned && getzoneid() == GLOBAL_ZONEID)
 			return (ENOENT);
 		if (isuser) {
 			struct passwd *pw;
 			pw = getpwnam(cp);
 			if (pw == NULL)
 				return (ENOENT);
 			*ridp = pw->pw_uid;
 		} else {
 			struct group *gr;
 			gr = getgrnam(cp);
 			if (gr == NULL)
 				return (ENOENT);
 			*ridp = gr->gr_gid;
 		}
 	} else {
 		/* It's a user/group ID (eg "12345"). */
 		uid_t id = strtoul(cp, &end, 10);
 		idmap_rid_t rid;
 		char *mapdomain;
 
 		if (*end != '\0')
 			return (EINVAL);
 		if (id > MAXUID) {
 			/* It's an ephemeral ID. */
 			if (idmap_id_to_numeric_domain_rid(id, isuser,
 			    &mapdomain, &rid) != 0)
 				return (ENOENT);
 			(void) strlcpy(domain, mapdomain, domainlen);
 			*ridp = rid;
 		} else {
 			*ridp = id;
 		}
 	}
 
 	ASSERT3P(numericsid, ==, NULL);
 	return (0);
 }
 
 /*
  * Shared worker for the userquota getters.  Decodes 'propname' into the
  * property type, domain and rid, issues ZFS_IOC_USERSPACE_ONE for this
  * dataset, and stores the resulting value in '*propvalue'.  Returns 0 on
  * success, the decoder's error code if the name cannot be decoded, or the
  * ioctl()'s return value if the ioctl fails.  '*typep' is only meaningful
  * when decoding succeeded.
  */
 static int
 zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue, zfs_userquota_prop_t *typep)
 {
 	int err;
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	err = userquota_propname_decode(propname,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
 	    typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid);
 	if (err)
 		return (err);
 
 	/*
 	 * Read *typep only after a successful decode: the decoder can fail
 	 * before writing it, and callers pass an uninitialized variable.
 	 */
 	zc.zc_objset_type = *typep;
 
 	err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc);
 	if (err)
 		return (err);
 
 	*propvalue = zc.zc_cookie;
 	return (0);
 }
 
 /*
  * Fetch a userquota-style property as a raw integer in '*propvalue'.  The
  * decoded property type is not needed here and is discarded.  Returns the
  * result of zfs_prop_get_userquota_common().
  */
 int
 zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue)
 {
 	zfs_userquota_prop_t ignored_type;
 	int err;
 
 	err = zfs_prop_get_userquota_common(zhp, propname, propvalue,
 	    &ignored_type);
 	return (err);
 }
 
 /*
  * Fetch a userquota-style property and format it into 'propbuf'.  With
  * 'literal' set the exact number is printed; otherwise a zero user or group
  * quota is shown as "none" and every other value goes through zfs_nicenum().
  * Returns 0 on success or the error from zfs_prop_get_userquota_common().
  */
 int
 zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal)
 {
 	int err;
 	uint64_t propvalue;
 	zfs_userquota_prop_t type;
 
 	err = zfs_prop_get_userquota_common(zhp, propname, &propvalue,
 	    &type);
 
 	if (err)
 		return (err);
 
 	if (literal) {
 		/* match %llu explicitly; uint64_t need not be long long */
 		(void) snprintf(propbuf, proplen, "%llu",
 		    (u_longlong_t)propvalue);
 	} else if (propvalue == 0 &&
 	    (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) {
 		(void) strlcpy(propbuf, "none", proplen);
 	} else {
 		zfs_nicenum(propvalue, propbuf, proplen);
 	}
 	return (0);
 }
 
 /*
  * Issue ZFS_IOC_SPACE_WRITTEN for this dataset against the snapshot named
  * after the first '@' in 'propname', and store the result in '*propvalue'.
  * A snapshot given without a dataset part is taken relative to this handle's
  * filesystem.  Returns 0 on success or the ioctl()'s return value on failure.
  */
 int
 zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue)
 {
 	zfs_cmd_t zc = { 0 };
 	const char *snap;
 	int err;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	/* everything after the first '@' of the property name */
 	snap = strchr(propname, '@') + 1;
 	if (strchr(snap, '@') == NULL) {
 		/*
 		 * Short name: start from our own name, drop any snapshot
 		 * part it may carry, then append "@<snap>".
 		 */
 		(void) strlcpy(zc.zc_value, zhp->zfs_name,
 		    sizeof (zc.zc_value));
 		zc.zc_value[strcspn(zc.zc_value, "@")] = '\0';
 		(void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value));
 		(void) strlcat(zc.zc_value, snap, sizeof (zc.zc_value));
 	} else {
 		/* already a full "dataset@snapshot" name */
 		(void) strlcpy(zc.zc_value, snap, sizeof (zc.zc_value));
 	}
 
 	err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc);
 	if (err != 0)
 		return (err);
 
 	*propvalue = zc.zc_cookie;
 	return (0);
 }
 
 /*
  * String form of zfs_prop_get_written_int(): formats the value into
  * 'propbuf', either as an exact number ('literal') or via zfs_nicenum().
  * Returns 0 on success or the error from zfs_prop_get_written_int().
  */
 int
 zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal)
 {
 	int err;
 	uint64_t propvalue;
 
 	err = zfs_prop_get_written_int(zhp, propname, &propvalue);
 
 	if (err)
 		return (err);
 
 	if (literal) {
 		/* match %llu explicitly; uint64_t need not be long long */
 		(void) snprintf(propbuf, proplen, "%llu",
 		    (u_longlong_t)propvalue);
 	} else {
 		zfs_nicenum(propvalue, propbuf, proplen);
 	}
 	return (0);
 }
 
 /*
  * Accessor for the dataset name stored in a zfs handle.
  */
 const char *
 zfs_get_name(const zfs_handle_t *zhp)
 {
 	const char *name = zhp->zfs_name;
 
 	return (name);
 }
 
 /*
  * Accessor for the dataset type stored in a zfs handle.
  */
 zfs_type_t
 zfs_get_type(const zfs_handle_t *zhp)
 {
 	zfs_type_t type = zhp->zfs_type;
 
 	return (type);
 }
 
 /*
  * Reports whether dataset ds2 lies underneath dataset ds1, i.e. whether ds2
  * is ds1 followed by a '/' and more.
  *
  * Cases that must be told apart:
  * Dataset 1	"a/foo"		"a/foo"		"a/foo"		"a/foo"
  * Dataset 2	"a/fo"		"a/foobar"	"a/bar/baz"	"a/foo/bar"
  * Descendant?	No.		No.		No.		Yes.
  */
 static boolean_t
 is_descendant(const char *ds1, const char *ds2)
 {
 	size_t plen = strlen(ds1);
 
 	/* a shorter name can never be underneath a longer one */
 	if (strlen(ds2) < plen)
 		return (B_FALSE);
 
 	/* the character right after the prefix has to be a separator */
 	if (ds2[plen] != '/')
 		return (B_FALSE);
 
 	return (strncmp(ds1, ds2, plen) == 0);
 }
 
 /*
  * Copies 'path' into 'buf' (at most 'buflen' bytes) and cuts the copy at its
  * last '/', leaving the name of the parent.  Returns 0 on success, or -1 when
  * the copy contains no '/' at all (the path is just a pool name); in that
  * case 'buf' still holds the copied path.
  */
 static int
 parent_name(const char *path, char *buf, size_t buflen)
 {
 	char *sep;
 
 	(void) strlcpy(buf, path, buflen);
 
 	sep = strrchr(buf, '/');
 	if (sep == NULL)
 		return (-1);
 
 	*sep = '\0';
 	return (0);
 }
 
 /*
  * If accept_ancestor is false, then check to make sure that the given path has
  * a parent, and that it exists.  If accept_ancestor is true, then find the
  * closest existing ancestor for the given path.  In prefixlen return the
  * length of already existing prefix of the given path.  We also fetch the
  * 'zoned' property, which is used to validate property settings when creating
  * new datasets.
  *
  * 'zoned' and 'prefixlen' may each be NULL if the caller is not interested.
  * Returns 0 on success; on failure an error has been recorded on 'hdl' and a
  * non-zero value is returned.
  */
 static int
 check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
     boolean_t accept_ancestor, int *prefixlen)
 {
 	zfs_cmd_t zc = { 0 };
 	char parent[ZFS_MAXNAMELEN];
 	char *slash;
 	zfs_handle_t *zhp;
 	char errbuf[1024];
 	uint64_t is_zoned;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
 
 	/* get parent, and check to see if this is just a pool */
 	if (parent_name(path, parent, sizeof (parent)) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "missing dataset name"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/* check to see if the pool exists */
 	if ((slash = strchr(parent, '/')) == NULL)
 		slash = parent + strlen(parent);
 	(void) strncpy(zc.zc_name, parent, slash - parent);
 	zc.zc_name[slash - parent] = '\0';
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
 	    errno == ENOENT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such pool '%s'"), zc.zc_name);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	/* check to see if the parent dataset exists */
 	while ((zhp = make_dataset_handle(hdl, parent)) == NULL) {
 		if (errno == ENOENT && accept_ancestor) {
 			/*
 			 * Go deeper to find an ancestor, give up on top level.
 			 *
 			 * The last component is cut off in place.  Calling
 			 * parent_name(parent, parent, ...) would make
 			 * strlcpy() copy a buffer onto itself.
 			 */
 			slash = strrchr(parent, '/');
 			if (slash == NULL) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "no such pool '%s'"), zc.zc_name);
 				return (zfs_error(hdl, EZFS_NOENT, errbuf));
 			}
 			*slash = '\0';
 		} else if (errno == ENOENT) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent does not exist"));
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		} else
 			return (zfs_standard_error(hdl, errno, errbuf));
 	}
 
 	is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 	if (zoned != NULL)
 		*zoned = is_zoned;
 
 	/* we are in a non-global zone, but parent is in the global zone */
 	if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
 		(void) zfs_standard_error(hdl, EPERM, errbuf);
 		zfs_close(zhp);
 		return (-1);
 	}
 
 	/* make sure parent is a filesystem */
 	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "parent is not a filesystem"));
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		zfs_close(zhp);
 		return (-1);
 	}
 
 	zfs_close(zhp);
 	if (prefixlen != NULL)
 		*prefixlen = strlen(parent);
 	return (0);
 }
 
 /*
  * Reports whether a dataset called 'path' exists and its type is one of the
  * types in the 'types' mask.  A name that fails validation counts as
  * non-existent.
  */
 boolean_t
 zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
 {
 	zfs_handle_t *zhp;
 	int kind;
 
 	if (!zfs_validate_name(hdl, path, types, B_FALSE))
 		return (B_FALSE);
 
 	/* Being able to build a handle for it is our existence test. */
 	zhp = make_dataset_handle(hdl, path);
 	if (zhp == NULL)
 		return (B_FALSE);
 
 	kind = zhp->zfs_type;
 	zfs_close(zhp);
 
 	if (types & kind)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Given a path to 'target', create all the ancestors between
  * the prefixlen portion of the path, and the target itself.
  * Fail if the initial prefixlen-ancestor does not already exist.
  *
  * 'target' is modified temporarily while the path is walked.  On success it
  * is restored and 0 is returned.  On failure -1 is returned and 'target' is
  * left cut off at the ancestor that failed, which is the name used in the
  * error message.
  */
 int
 create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 {
 	zfs_handle_t *h;
 	char *cp;
 	const char *opname;
 
 	/* make sure prefix exists */
 	cp = target + prefixlen;
 	if (*cp != '/') {
 		assert(strchr(cp, '/') == NULL);
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 	} else {
 		*cp = '\0';
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		*cp = '/';
 	}
 	if (h == NULL)
 		return (-1);
 	zfs_close(h);
 
 	/*
 	 * The prefix is the whole string, so there is nothing in between to
 	 * create.  Returning here keeps the scan below from starting one byte
 	 * past the terminator.
 	 */
 	if (*cp == '\0')
 		return (0);
 
 	/*
 	 * Attempt to create, mount, and share any ancestor filesystems,
 	 * up to the prefixlen-long one.
 	 */
 	for (cp = target + prefixlen + 1;
 	    (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
 
 		*cp = '\0';
 
 		h = make_dataset_handle(hdl, target);
 		if (h) {
 			/* it already exists, nothing to do here */
 			zfs_close(h);
 			continue;
 		}
 
 		/* h is NULL from here until zfs_open() succeeds */
 		if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
 		    NULL) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "create");
 			goto ancestorerr;
 		}
 
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL) {
 			opname = dgettext(TEXT_DOMAIN, "open");
 			goto ancestorerr;
 		}
 
 		if (zfs_mount(h, NULL, 0) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "mount");
 			goto ancestorerr;
 		}
 
 		if (zfs_share(h) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "share");
 			goto ancestorerr;
 		}
 
 		zfs_close(h);
 	}
 
 	return (0);
 
 ancestorerr:
 	/* release the handle if the failure came after a successful open */
 	if (h != NULL)
 		zfs_close(h);
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "failed to %s ancestor '%s'"), opname, target);
 	return (-1);
 }
 
 /*
  * Creates whichever ancestors of 'path' do not exist yet.  The closest
  * existing ancestor is located with check_parents(), and the rest is created
  * by create_parents() working on a private copy of the path.  Returns 0 on
  * success and -1 on any failure, including failure to duplicate the path.
  */
 int
 zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
 {
 	int existing_len;
 	int status;
 	char *scratch;
 
 	if (check_parents(hdl, path, NULL, B_TRUE, &existing_len) != 0)
 		return (-1);
 
 	/* create_parents() edits its argument, so hand it a copy */
 	scratch = strdup(path);
 	if (scratch == NULL)
 		return (-1);
 
 	status = create_parents(hdl, scratch, existing_len);
 	free(scratch);
 
 	return (status != 0 ? -1 : 0);
 }
 
 /*
  * Create a new filesystem or volume.
  *
  * 'path' must be a valid name for 'type', its parent must already exist, and
  * no dataset of that name may exist.  'props' (may be NULL) is run through
  * zfs_valid_proplist(); the list that call returns is what is handed to
  * lzc_create() and freed afterwards.  For volumes a non-zero volsize that is
  * a multiple of the block size is required.  Returns 0 on success; on failure
  * an error is recorded on 'hdl' and a non-zero value is returned.
  */
 int
 zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
     nvlist_t *props)
 {
 	int ret;
 	uint64_t size = 0;
 	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 	char errbuf[1024];
 	uint64_t zoned;
 	dmu_objset_type_t ost;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* validate the path, taking care to note the extended error message */
 	if (!zfs_validate_name(hdl, path, type, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0)
 		return (-1);
 
 	/*
 	 * The failure modes when creating a dataset of a different type over
 	 * one that already exists is a little strange.  In particular, if you
 	 * try to create a dataset on top of an existing dataset, the ioctl()
 	 * will return ENOENT, not EEXIST.  To prevent this from happening, we
 	 * first try to see if the dataset exists.
 	 */
 	if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (type == ZFS_TYPE_VOLUME)
 		ost = DMU_OST_ZVOL;
 	else
 		ost = DMU_OST_ZFS;
 
 	/* open zpool handle for prop validation */
 	char pool_path[MAXNAMELEN];
 	(void) strlcpy(pool_path, path, sizeof (pool_path));
 
 	/* truncate pool_path at first slash */
 	char *p = strchr(pool_path, '/');
 	if (p != NULL)
 		*p = '\0';
 
 	/*
 	 * Bail out if the pool cannot be opened, so that a NULL handle is
 	 * never passed to zfs_valid_proplist() or zpool_close().
 	 * NOTE(review): presumably zpool_open() has already recorded the
 	 * reason on 'hdl' -- confirm.
 	 */
 	zpool_handle_t *zpool_handle = zpool_open(hdl, pool_path);
 	if (zpool_handle == NULL)
 		return (-1);
 
 	if (props && (props = zfs_valid_proplist(hdl, type, props,
 	    zoned, NULL, zpool_handle, errbuf)) == NULL) {
 		zpool_close(zpool_handle);
 		return (-1);
 	}
 	zpool_close(zpool_handle);
 
 	if (type == ZFS_TYPE_VOLUME) {
 		/*
 		 * If we are creating a volume, the size and block size must
 		 * satisfy a few restraints.  First, the blocksize must be a
 		 * valid block size between SPA_{MIN,MAX}BLOCKSIZE.  Second, the
 		 * volsize must be a multiple of the block size, and cannot be
 		 * zero.
 		 */
 		if (props == NULL || nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing volume size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if ((ret = nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 		    &blocksize)) != 0) {
 			if (ret == ENOENT) {
 				/* not specified: fall back to the default */
 				blocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 			} else {
 				nvlist_free(props);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "missing volume block size"));
 				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 			}
 		}
 
 		if (size == 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size cannot be zero"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if (size % blocksize != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size must be a multiple of volume block "
 			    "size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 	}
 
 	/* create the dataset */
 	ret = lzc_create(path, ost, props);
 	nvlist_free(props);
 
 	/* check for failure */
 	if (ret != 0) {
 		char parent[ZFS_MAXNAMELEN];
 		(void) parent_name(path, parent, sizeof (parent));
 
 		switch (errno) {
 		case ENOENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "no such parent '%s'"), parent);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		case EINVAL:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent '%s' is not a filesystem"), parent);
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded to set this "
 			    "property or value"));
 			return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
 #ifdef _ILP32
 		case EOVERFLOW:
 			/*
 			 * This platform can't address a volume this big.
 			 */
 			if (type == ZFS_TYPE_VOLUME)
 				return (zfs_error(hdl, EZFS_VOLTOOBIG,
 				    errbuf));
 #endif
 			/* FALLTHROUGH */
 		default:
 			return (zfs_standard_error(hdl, errno, errbuf));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Destroys the given dataset.  The caller must make sure that the filesystem
  * isn't mounted, and that there are no active dependents.  Bookmarks are
  * destroyed through lzc_destroy_bookmarks(); everything else goes through the
  * ZFS_IOC_DESTROY ioctl, where an ENOENT failure is not treated as an error.
  * Returns 0 on success, otherwise the result of zfs_standard_error_fmt().
  */
 int
 zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 {
 	zfs_cmd_t zc = { 0 };
 
 	if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
 		nvlist_t *bmarks = fnvlist_alloc();
 		int error;
 
 		fnvlist_add_boolean(bmarks, zhp->zfs_name);
 		error = lzc_destroy_bookmarks(bmarks, NULL);
 		fnvlist_free(bmarks);
 		if (error == 0)
 			return (0);
 
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 		    zhp->zfs_name));
 	}
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_objset_type = ZFS_IS_VOLUME(zhp) ? DMU_OST_ZVOL : DMU_OST_ZFS;
 	zc.zc_defer_destroy = defer;
 
 	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 &&
 	    errno != ENOENT) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 		    zhp->zfs_name));
 	}
 
 	remove_mountpoint(zhp);
 
 	return (0);
 }
 
 /*
  * State handed to zfs_check_snap_cb() while it collects the snapshots that
  * zfs_destroy_snaps() is going to destroy.
  */
 struct destroydata {
 	/* collected "<dataset>@<snapname>" names that exist */
 	nvlist_t *nvl;
 	/* short snapshot name looked for in each dataset */
 	const char *snapname;
 };
 
 /*
  * Iterator callback: if this dataset has a snapshot with the name being
  * looked for, record it in the destroydata list, then recurse into the child
  * filesystems.  The handle passed in is closed before returning.  Returns
  * the result of the recursive iteration.
  */
 static int
 zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *state = arg;
 	char fullname[ZFS_MAXNAMELEN];
 	int ret;
 
 	(void) snprintf(fullname, sizeof (fullname),
 	    "%s@%s", zhp->zfs_name, state->snapname);
 
 	if (lzc_exists(fullname)) {
 		verify(nvlist_add_boolean(state->nvl, fullname) == 0);
 	}
 
 	ret = zfs_iter_filesystems(zhp, zfs_check_snap_cb, state);
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Destroys every snapshot called 'snapname' in zhp and its descendants.
  * Reports ENOENT if no such snapshot exists anywhere in that tree; otherwise
  * returns the result of zfs_destroy_snaps_nvl().
  */
 int
 zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
 {
 	struct destroydata dd = { 0 };
 	int ret;
 
 	dd.snapname = snapname;
 	verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0);
 
 	/* the callback closes its handle, so give it a duplicate */
 	(void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);
 
 	if (!nvlist_empty(dd.nvl)) {
 		ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
 	} else {
 		ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
 		    zhp->zfs_name, snapname);
 	}
 
 	nvlist_free(dd.nvl);
 	return (ret);
 }
 
 /*
  * Destroys all the snapshots named in the nvlist.
  *
  * Returns 0 if lzc_destroy_snaps() succeeds.  Otherwise an error is recorded
  * for each snapshot listed in the returned error list (or one generic error
  * if that list is empty) and the last error result is returned.  The error
  * list is released on every path.
  */
 int
 zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
 {
 	int ret;
 	nvlist_t *errlist = NULL;
 
 	ret = lzc_destroy_snaps(snaps, defer, &errlist);
 
 	if (ret == 0) {
 		nvlist_free(errlist);
 		return (0);
 	}
 
 	if (nvlist_empty(errlist)) {
 		char errbuf[1024];
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
 
 		ret = zfs_standard_error(hdl, ret, errbuf);
 	}
 	for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
 		char errbuf[1024];
 		int32_t snap_err = fnvpair_value_int32(pair);
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
 		    nvpair_name(pair));
 
 		switch (snap_err) {
 		case EEXIST:
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "snapshot is cloned"));
 			ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
 			break;
 		default:
 			/*
 			 * Report this snapshot's own error code rather than
 			 * errno, which is not specific to this entry and may
 			 * have been overwritten since the ioctl.
 			 */
 			ret = zfs_standard_error(hdl, snap_err, errbuf);
 			break;
 		}
 	}
 
 	nvlist_free(errlist);
 	return (ret);
 }
 
 /*
  * Clones the snapshot 'zhp' into a new dataset called 'target'.  The target
  * name must be valid and its parent must exist.  'props' (may be NULL) is
  * validated as volume or filesystem properties to match the source.
  * Returns 0 on success; on failure an error is recorded on the library
  * handle and a non-zero value is returned.
  */
 int
 zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char parent[ZFS_MAXNAMELEN];
 	char errbuf[1024];
 	uint64_t zoned;
 	int ret;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), target);
 
 	/* validate the target/clone name */
 	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0)
 		return (-1);
 
 	/* only used for the error message further down */
 	(void) parent_name(target, parent, sizeof (parent));
 
 	if (props != NULL) {
 		zfs_type_t type = ZFS_IS_VOLUME(zhp) ?
 		    ZFS_TYPE_VOLUME : ZFS_TYPE_FILESYSTEM;
 
 		props = zfs_valid_proplist(hdl, type, props, zoned,
 		    zhp, zhp->zpool_hdl, errbuf);
 		if (props == NULL)
 			return (-1);
 	}
 
 	/* do the clone */
 	ret = lzc_clone(target, zhp->zfs_name, props);
 	nvlist_free(props);
 
 	if (ret == 0)
 		return (0);
 
 	switch (errno) {
 	case ENOENT:
 		/*
 		 * The parent doesn't exist.  We should have caught this
 		 * above, but there may a race condition that has since
 		 * destroyed the parent.
 		 *
 		 * At this point, we don't know whether it's the source
 		 * that doesn't exist anymore, or whether the target
 		 * dataset doesn't exist.
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such parent '%s'"), parent);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 	case EXDEV:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "source and target pools differ"));
 		return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 	default:
 		return (zfs_standard_error(hdl, errno, errbuf));
 	}
 }
 
 /*
  * Promotes the given clone so that it takes the place of its origin.
  * Snapshots and datasets without an origin are rejected with EZFS_BADTYPE.
  * A snapshot name conflict reported by the ioctl (EEXIST) is turned into
  * EZFS_EXISTS.  Returns 0 on success.
  */
 int
 zfs_promote(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char origin[MAXPATHLEN];
 	char errbuf[1024];
 	int saved;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot promote '%s'"), zhp->zfs_name);
 
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be promoted"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	/* an empty origin means this dataset is not a clone */
 	(void) strlcpy(origin, zhp->zfs_dmustats.dds_origin, sizeof (origin));
 	if (origin[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin,
 	    sizeof (zc.zc_value));
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc) == 0)
 		return (0);
 
 	/* keep errno safe from the calls made while reporting */
 	saved = errno;
 	if (saved == EEXIST) {
 		/* There is a conflicting snapshot name. */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "conflicting snapshot '%s' from parent '%s'"),
 		    zc.zc_string, origin);
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	return (zfs_standard_error(hdl, saved, errbuf));
 }
 
 /*
  * State handed to zfs_snapshot_cb() while it builds the list of snapshots to
  * create for a recursive snapshot.
  */
 typedef struct snapdata {
 	/* collected "<dataset>@<snapname>" names */
 	nvlist_t *sd_nvl;
 	/* short snapshot name appended to each dataset */
 	const char *sd_snapname;
 } snapdata_t;
 
 /*
  * Iterator callback: unless the dataset is marked inconsistent, add
  * "<dataset>@<snapname>" to the snapdata list and recurse into the child
  * filesystems.  Inconsistent datasets are skipped together with everything
  * below them.  The handle passed in is always closed.  Returns the result of
  * the recursive iteration, or 0 when the dataset was skipped.
  */
 static int
 zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
 {
 	snapdata_t *state = arg;
 	char fullname[ZFS_MAXNAMELEN];
 	int ret;
 
 	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	(void) snprintf(fullname, sizeof (fullname),
 	    "%s@%s", zfs_get_name(zhp), state->sd_snapname);
 	fnvlist_add_boolean(state->sd_nvl, fullname);
 
 	ret = zfs_iter_filesystems(zhp, zfs_snapshot_cb, state);
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Creates snapshots.  The keys in the snaps nvlist are the snapshots to be
  * created.
  *
  * Every name is validated first.  'props' (may be NULL) is validated against
  * the pool of the first snapshot.  Returns 0 on success, -1 if the pool
  * cannot be opened or the properties are invalid, the result of zfs_error()
  * for an invalid name, or the non-zero result of lzc_snapshot() after the
  * individual failures have been reported on 'hdl'.
  */
 int
 zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
 {
 	int ret;
 	char errbuf[1024];
 	nvpair_t *elem;
 	nvlist_t *errors;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create snapshots "));
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
 		const char *snapname = nvpair_name(elem);
 
 		/* validate the target name */
 		if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
 		    B_TRUE)) {
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN,
 			    "cannot create snapshot '%s'"), snapname);
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 	}
 
 	/*
 	 * get pool handle for prop validation. assumes all snaps are in the
 	 * same pool, as does lzc_snapshot (below).
 	 */
 	char pool[MAXNAMELEN];
 	elem = nvlist_next_nvpair(snaps, NULL);
 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 	pool[strcspn(pool, "/@")] = '\0';
 
 	/*
 	 * Bail out if the pool cannot be opened, so that a NULL handle is
 	 * never passed to zfs_valid_proplist() or zpool_close().
 	 * NOTE(review): presumably zpool_open() has already recorded the
 	 * reason on 'hdl' -- confirm.
 	 */
 	zpool_handle_t *zpool_hdl = zpool_open(hdl, pool);
 	if (zpool_hdl == NULL)
 		return (-1);
 
 	if (props != NULL &&
 	    (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
 	    props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
 		zpool_close(zpool_hdl);
 		return (-1);
 	}
 	zpool_close(zpool_hdl);
 
 	ret = lzc_snapshot(snaps, props, &errors);
 
 	if (ret != 0) {
 		boolean_t printed = B_FALSE;
 
 		/* report each snapshot that has its own error entry */
 		for (elem = nvlist_next_nvpair(errors, NULL);
 		    elem != NULL;
 		    elem = nvlist_next_nvpair(errors, elem)) {
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN,
 			    "cannot create snapshot '%s'"), nvpair_name(elem));
 			(void) zfs_standard_error(hdl,
 			    fnvpair_value_int32(elem), errbuf);
 			printed = B_TRUE;
 		}
 		/* nothing specific was reported: describe the overall error */
 		if (!printed) {
 			switch (ret) {
 			case EXDEV:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "multiple snapshots of same "
 				    "fs not allowed"));
 				(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
 
 				break;
 			default:
 				(void) zfs_standard_error(hdl, ret, errbuf);
 			}
 		}
 	}
 
 	nvlist_free(props);
 	nvlist_free(errors);
 	return (ret);
 }
 
 /*
  * Snapshot the dataset named by 'path' ("dataset@snapname").  The names to
  * create are collected in an nvlist and passed, together with 'props', to
  * zfs_snapshot_nvl(), whose result is returned.  When 'recursive' is set
  * the list is filled in by zfs_snapshot_cb() instead of holding only 'path'.
  */
 int
 zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
     nvlist_t *props)
 {
 	char msg[1024];
 	char fsname[ZFS_MAXNAMELEN];
 	snapdata_t sd = { 0 };
 	zfs_handle_t *fs_hdl;
 	char *at;
 	int rc;
 
 	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
 	    "cannot snapshot %s"), path);
 
 	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
 
 	/* Split a private copy at '@': dataset part, then snapshot part. */
 	(void) strlcpy(fsname, path, sizeof (fsname));
 	at = strchr(fsname, '@');
 	*at++ = '\0';
 	sd.sd_snapname = at;
 
 	fs_hdl = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (fs_hdl == NULL)
 		return (-1);
 
 	verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0);
 	if (!recursive) {
 		fnvlist_add_boolean(sd.sd_nvl, path);
 	} else {
 		/* The callback receives its own duplicate of the handle. */
 		(void) zfs_snapshot_cb(zfs_handle_dup(fs_hdl), &sd);
 	}
 
 	rc = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
 	nvlist_free(sd.sd_nvl);
 	zfs_close(fs_hdl);
 	return (rc);
 }
 
 /*
  * Destroy any more recent snapshots.  The dependents of each such snapshot
  * are destroyed first, by rollback_destroy_dependent(), which destroys them
  * without checking the transaction group.
  */
 /* State shared between zfs_rollback() and its iteration callbacks. */
 typedef struct rollback_data {
 	const char	*cb_target;		/* the snapshot */
 	uint64_t	cb_create;		/* creation time reference */
 	boolean_t	cb_error;	/* set when any destroy step fails */
 	boolean_t	cb_force;	/* gather changelists with MS_FORCE */
 } rollback_data_t;
 
 /*
  * zfs_iter_dependents() callback used by rollback_destroy(): unmount and
  * destroy one dependent.  Failures are recorded in cbp->cb_error rather
  * than returned, so the function always returns 0.  The handle is always
  * closed here.
  */
 static int
 rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 	prop_changelist_t *clp;
 
 	/* We must destroy this clone; first unmount it */
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
 	    cbp->cb_force ? MS_FORCE: 0);
 	if (clp == NULL || changelist_prefix(clp) != 0) {
 		cbp->cb_error = B_TRUE;
 		/* The changelist exists when only the prefix step failed. */
 		if (clp != NULL)
 			changelist_free(clp);
 		zfs_close(zhp);
 		return (0);
 	}
 	if (zfs_destroy(zhp, B_FALSE) != 0)
 		cbp->cb_error = B_TRUE;
 	else
 		changelist_remove(clp, zhp->zfs_name);
 	(void) changelist_postfix(clp);
 	changelist_free(clp);
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Iteration callback used by zfs_rollback(): destroy 'zhp' and its
  * dependents when it was created after the rollback target
  * (cb_create).  Failures accumulate in cb_error; the return value is
  * always 0.  The handle is always closed here.
  */
 static int
 rollback_destroy(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *rb = data;
 	uint64_t createtxg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG);
 
 	if (createtxg > rb->cb_create) {
 		/* Dependents go first, then the dataset itself. */
 		rb->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
 		    rollback_destroy_dependent, rb);
 
 		rb->cb_error |= zfs_destroy(zhp, B_FALSE);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Given a dataset, rollback to a specific snapshot, discarding any
  * data changes since then and making it the active dataset.
  *
  * Any snapshots and bookmarks more recent than the target are
  * destroyed, along with their dependents (i.e. clones).
  */
 int
 zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 {
 	rollback_data_t cb = { 0 };
 	int err;
 	boolean_t restore_resv = 0;
 	uint64_t old_volsize, new_volsize;
 	zfs_prop_t resv_prop;
 
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
 	    zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	/*
 	 * Destroy all recent snapshots and their dependents.
 	 */
 	cb.cb_force = force;
 	cb.cb_target = snap->zfs_name;
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	(void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb);
 	(void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb);
 
 	/* The callbacks report failures only through cb_error. */
 	if (cb.cb_error)
 		return (-1);
 
 	/*
 	 * Now that we have verified that the snapshot is the latest,
 	 * rollback to the given snapshot.
 	 */
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 			return (-1);
 		old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 		/* Remember whether the reservation tracked the volsize. */
 		restore_resv =
 		    (old_volsize == zfs_prop_get_int(zhp, resv_prop));
 	}
 
 	/*
 	 * We rely on the snapshot and bookmark iteration above to remove
 	 * anything newer than the target for the given dataset.
 	 * Therefore, we can simply pass the name on to the ioctl() call.
 	 * There is still an unlikely race condition where the user has
 	 * taken a snapshot since we verified that this was the most recent.
 	 */
 	err = lzc_rollback(zhp->zfs_name, NULL, 0);
 	if (err != 0) {
 		/*
 		 * Report the code returned by lzc_rollback() itself; errno
 		 * may have been overwritten by the time we get here.
 		 */
 		(void) zfs_standard_error_fmt(zhp->zfs_hdl, err,
 		    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
 		    zhp->zfs_name);
 		return (err);
 	}
 
 	/*
 	 * For volumes, if the pre-rollback volsize matched the pre-
 	 * rollback reservation and the volsize has changed then set
 	 * the reservation property to the post-rollback volsize.
 	 * Make a new handle since the rollback closed the dataset.
 	 */
 	if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
 	    (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
 		if (restore_resv) {
 			new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 			if (old_volsize != new_volsize)
 				err = zfs_prop_set_int(zhp, resv_prop,
 				    new_volsize);
 		}
 		/* Closes the fresh handle, not the caller's original one. */
 		zfs_close(zhp);
 	}
 	return (err);
 }
 
 /*
  * Renames the given dataset.
  */
 int
 zfs_rename(zfs_handle_t *zhp, const char *source, const char *target,
     renameflags_t flags)
 {
 	int ret;
 	zfs_cmd_t zc = { 0 };
 	char *delim;
 	prop_changelist_t *cl = NULL;
 	zfs_handle_t *zhrp = NULL;
 	char *parentname = NULL;
 	char parent[ZFS_MAXNAMELEN];
 	char property[ZFS_MAXPROPLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 
 	/* if we have the same exact name, just return success */
 	if (strcmp(zhp->zfs_name, target) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot rename to '%s'"), target);
 
 	if (source != NULL) {
 		/*
 		 * This is recursive snapshots rename, put snapshot name
 		 * (that might not exist) into zfs_name.
 		 */
 		assert(flags.recurse);
 
 		/* Note: this rewrites the caller's handle in place. */
 		(void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name));
 		(void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name));
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	}
 
 	/*
 	 * Make sure the target name is valid
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		if ((strchr(target, '@') == NULL) ||
 		    *target == '@') {
 			/*
 			 * Snapshot target name is abbreviated,
 			 * reconstruct full dataset name
 			 */
 			(void) strlcpy(parent, zhp->zfs_name,
 			    sizeof (parent));
 			delim = strchr(parent, '@');
 			/*
 			 * Keep the '@' when the target lacks one; drop it
 			 * when the target brings its own leading '@'.
 			 */
 			if (strchr(target, '@') == NULL)
 				*(++delim) = '\0';
 			else
 				*delim = '\0';
 			(void) strlcat(parent, target, sizeof (parent));
 			target = parent;
 		} else {
 			/*
 			 * Make sure we're renaming within the same dataset.
 			 */
 			delim = strchr(target, '@');
 			if (strncmp(zhp->zfs_name, target, delim - target)
 			    != 0 || zhp->zfs_name[delim - target] != '@') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "snapshots must be part of same "
 				    "dataset"));
 				return (zfs_error(hdl, EZFS_CROSSTARGET,
 				    errbuf));
 			}
 		}
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
 		if (flags.recurse) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "recursive rename must be a snapshot"));
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 		}
 
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 		/* validate parents */
 		if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
 			return (-1);
 
 		/* make sure we're in the same pool */
 		verify((delim = strchr(target, '/')) != NULL);
 		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
 		    zhp->zfs_name[delim - target] != '/') {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "datasets must be within same pool"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 		}
 
 		/* new name cannot be a child of the current dataset name */
 		if (is_descendant(zhp->zfs_name, target)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "New dataset name cannot be a descendant of "
 			    "current dataset name"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 	}
 
 	/* From here on, errors are reported against the source name. */
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
 
 	if (getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Avoid unmounting file systems with mountpoint property set to
 	 * 'legacy' or 'none' even if -u option is not given.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
 	    !flags.recurse && !flags.nounmount &&
 	    zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
 	    sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
 	    (strcmp(property, "legacy") == 0 ||
 	     strcmp(property, "none") == 0)) {
 		flags.nounmount = B_TRUE;
 	}
 	if (flags.recurse) {
 
 		/* Open the dataset that owns the snapshot being renamed. */
 		parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		if (parentname == NULL) {
 			ret = -1;
 			goto error;
 		}
 		delim = strchr(parentname, '@');
 		*delim = '\0';
 		zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET);
 		if (zhrp == NULL) {
 			ret = -1;
 			goto error;
 		}
 	} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
 		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
 		    flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0,
 		    flags.forceunmount ? MS_FORCE : 0)) == NULL) {
 			return (-1);
 		}
 
 		if (changelist_haszonedchild(cl)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "child dataset with inherited mountpoint is used "
 			    "in a non-global zone"));
 			/*
 			 * Capture the failure code: 'ret' has not been
 			 * assigned yet on this path and is returned at the
 			 * 'error' label.
 			 */
 			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 			goto error;
 		}
 
 		if ((ret = changelist_prefix(cl)) != 0)
 			goto error;
 	}
 
 	if (ZFS_IS_VOLUME(zhp))
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
 
 	/* zc_cookie flag bits: 1 = recursive, 2 = do not unmount. */
 	zc.zc_cookie = flags.recurse ? 1 : 0;
 	if (flags.nounmount)
 		zc.zc_cookie |= 2;
 
 	if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
 		/*
 		 * if it was recursive, the one that actually failed will
 		 * be in zc.zc_name
 		 */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot rename '%s'"), zc.zc_name);
 
 		if (flags.recurse && errno == EEXIST) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "a child dataset already has a snapshot "
 			    "with the new name"));
 			(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
 		} else {
 			(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
 		}
 
 		/*
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
 		if (cl != NULL)
 			(void) changelist_postfix(cl);
 	} else {
 		if (cl != NULL) {
 			changelist_rename(cl, zfs_get_name(zhp), target);
 			ret = changelist_postfix(cl);
 		}
 	}
 
 error:
 	/* Common cleanup for every path that reaches the rename stage. */
 	if (parentname != NULL) {
 		free(parentname);
 	}
 	if (zhrp != NULL) {
 		zfs_close(zhrp);
 	}
 	if (cl != NULL) {
 		changelist_free(cl);
 	}
 	return (ret);
 }
 
 /*
  * Return the user-property nvlist stored in the handle.  The pointer is
  * handed back as-is; no copy is made.
  */
 nvlist_t *
 zfs_get_user_props(zfs_handle_t *zhp)
 {
 	nvlist_t *uprops = zhp->zfs_user_props;
 
 	return (uprops);
 }
 
 /*
  * Return the handle's received-property nvlist, calling
  * get_recvd_props_ioctl() first when the handle has none yet.  Returns
  * NULL if that call fails.
  */
 nvlist_t *
 zfs_get_recvd_props(zfs_handle_t *zhp)
 {
 	if (zhp->zfs_recvd_props == NULL &&
 	    get_recvd_props_ioctl(zhp) != 0)
 		return (NULL);
 
 	return (zhp->zfs_recvd_props);
 }
 
 /*
  * This function is used by 'zfs list' to determine the exact set of columns to
  * display, and their maximum widths.  This does two main things:
  *
  *      - If this is a list of all properties, then expand the list to include
  *        all native properties, and set a flag so that for each dataset we look
  *        for new unique user properties and add them to the list.
  *
  *      - For non fixed-width properties, keep track of the maximum width seen
  *        so that we can size the column appropriately. If the user has
  *        requested received property values, we also need to compute the width
  *        of the RECEIVED column.
  */
 int
 zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received,
     boolean_t literal)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zprop_list_t *pl;
 	zprop_list_t **slot, **first_user;
 	nvlist_t *uprops, *valnvl;
 	nvpair_t *pair;
 	char *value;
 	char scratch[ZFS_MAXPROPLEN];
 
 	if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0)
 		return (-1);
 
 	uprops = zfs_get_user_props(zhp);
 
 	pl = *plp;
 	if (pl->pl_all && nvlist_next_nvpair(uprops, NULL) != NULL) {
 		/*
 		 * Locate the first non-native (user property) entry; new
 		 * user properties are searched for and appended from there.
 		 */
 		for (first_user = plp; *first_user != NULL;
 		    first_user = &(*first_user)->pl_next) {
 			if ((*first_user)->pl_prop == ZPROP_INVAL)
 				break;
 		}
 
 		for (pair = nvlist_next_nvpair(uprops, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(uprops, pair)) {
 			const char *uname = nvpair_name(pair);
 
 			/* Is this user property already in the list? */
 			slot = first_user;
 			while (*slot != NULL &&
 			    strcmp((*slot)->pl_user_prop, uname) != 0)
 				slot = &(*slot)->pl_next;
 
 			if (*slot != NULL)
 				continue;
 
 			/* Not listed yet: append a new entry at the tail. */
 			pl = zfs_alloc(hdl, sizeof (zprop_list_t));
 			if (pl == NULL ||
 			    (pl->pl_user_prop = zfs_strdup(hdl,
 			    uname)) == NULL) {
 				free(pl);
 				return (-1);
 			}
 
 			pl->pl_prop = ZPROP_INVAL;
 			pl->pl_width = strlen(uname);
 			pl->pl_all = B_TRUE;
 			*slot = pl;
 		}
 	}
 
 	/*
 	 * Widen every non-fixed column to fit this dataset's values.
 	 */
 	for (pl = *plp; pl != NULL; pl = pl->pl_next) {
 		if (pl->pl_fixed && !literal)
 			continue;
 
 		if (pl->pl_prop == ZPROP_INVAL) {
 			/* User property: the value lives in the nvlist. */
 			if (nvlist_lookup_nvlist(uprops, pl->pl_user_prop,
 			    &valnvl) == 0) {
 				verify(nvlist_lookup_string(valnvl,
 				    ZPROP_VALUE, &value) == 0);
 				if (strlen(value) > pl->pl_width)
 					pl->pl_width = strlen(value);
 			}
 		} else {
 			/* Native property: format it and measure. */
 			if (zfs_prop_get(zhp, pl->pl_prop, scratch,
 			    sizeof (scratch), NULL, NULL, 0, literal) == 0 &&
 			    strlen(scratch) > pl->pl_width)
 				pl->pl_width = strlen(scratch);
 		}
 
 		/* The RECEIVED column is sized only when requested. */
 		if (received) {
 			const char *pname = (pl->pl_prop != ZPROP_INVAL) ?
 			    zfs_prop_to_name(pl->pl_prop) : pl->pl_user_prop;
 
 			if (zfs_prop_get_recvd(zhp, pname, scratch,
 			    sizeof (scratch), literal) == 0 &&
 			    strlen(scratch) > pl->pl_recvd_width)
 				pl->pl_recvd_width = strlen(scratch);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Fill in a zfs_cmd_t from the arguments and issue ZFS_IOC_SHARE on the
  * library's device descriptor.  'resource' may be NULL.  Returns the raw
  * ioctl() result.
  */
 int
 zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
     char *resource, void *export, void *sharetab,
     int sharemax, zfs_share_op_t operation)
 {
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
 	if (resource != NULL) {
 		(void) strlcpy(zc.zc_string, resource,
 		    sizeof (zc.zc_string));
 	}
 
 	/* The two buffers are passed as integer-encoded addresses. */
 	zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export;
 	zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab;
 	zc.zc_share.z_sharemax = sharemax;
 	zc.zc_share.z_sharetype = operation;
 
 	return (ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc));
 }
 
 /*
  * Remove from the handle's property nvlist every native property whose
  * slot in 'props' (indexed by zfs_prop_t) is B_FALSE.  The table pointer
  * is also stored in the handle.
  */
 void
 zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
 {
 	nvpair_t *pair, *following;
 
 	/*
 	 * Keep a reference to the props-table against which we prune the
 	 * properties.
 	 */
 	zhp->zfs_props_table = props;
 
 	for (pair = nvlist_next_nvpair(zhp->zfs_props, NULL); pair != NULL;
 	    pair = following) {
 		zfs_prop_t prop = zfs_name_to_prop(nvpair_name(pair));
 
 		/* Fetch the successor before 'pair' can be removed. */
 		following = nvlist_next_nvpair(zhp->zfs_props, pair);
 
 		/*
 		 * User properties will result in ZPROP_INVAL, and since we
 		 * only know how to prune standard ZFS properties, we always
 		 * leave these in the list.  This can also happen if we
 		 * encounter an unknown DSL property (when running older
 		 * software, for example).
 		 */
 		if (prop == ZPROP_INVAL || props[prop] != B_FALSE)
 			continue;
 
 		(void) nvlist_remove(zhp->zfs_props, nvpair_name(pair),
 		    nvpair_type(pair));
 	}
 }
 
 #ifdef illumos
 /*
  * Common implementation of the zfs_smb_acl_*() entry points: build a
  * zfs_cmd_t for 'cmd' and issue ZFS_IOC_SMB_ACL.
  *
  * ADD and REMOVE pass 'resource1' in zc_string.  RENAME passes
  * 'resource1' and 'resource2' in a source nvlist under ZFS_SMB_ACL_SRC and
  * ZFS_SMB_ACL_TARGET.  PURGE takes no resource.
  *
  * Returns the ioctl() result, or -1 when the command is unknown or the
  * request cannot be constructed.
  */
 static int
 zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
     zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
 {
 	zfs_cmd_t zc = { 0 };
 	nvlist_t *nvlist = NULL;
 	int error;
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
 	zc.zc_cookie = (uint64_t)cmd;
 
 	if (cmd == ZFS_SMB_ACL_RENAME) {
 		if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
 			(void) no_memory(hdl);
 			/* An allocation failure must not look like success. */
 			return (-1);
 		}
 	}
 
 	switch (cmd) {
 	case ZFS_SMB_ACL_ADD:
 	case ZFS_SMB_ACL_REMOVE:
 		(void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string));
 		break;
 	case ZFS_SMB_ACL_RENAME:
 		if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC,
 		    resource1) != 0 ||
 		    nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET,
 		    resource2) != 0) {
 			(void) no_memory(hdl);
 			/* Release the partially built list before bailing. */
 			nvlist_free(nvlist);
 			return (-1);
 		}
 		if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) {
 			nvlist_free(nvlist);
 			return (-1);
 		}
 		break;
 	case ZFS_SMB_ACL_PURGE:
 		break;
 	default:
 		/* nvlist is only allocated for RENAME, so nothing to free. */
 		return (-1);
 	}
 	error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
 	if (nvlist)
 		nvlist_free(nvlist);
 	return (error);
 }
 
 /*
  * Issue a ZFS_SMB_ACL_ADD request for 'resource' via zfs_smb_acl_mgmt().
  */
 int
 zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset,
     char *path, char *resource)
 {
 	int rc;
 
 	rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD,
 	    resource, NULL);
 	return (rc);
 }
 
 /*
  * Issue a ZFS_SMB_ACL_REMOVE request for 'resource' via
  * zfs_smb_acl_mgmt().
  */
 int
 zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset,
     char *path, char *resource)
 {
 	int rc;
 
 	rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE,
 	    resource, NULL);
 	return (rc);
 }
 
 /*
  * Issue a ZFS_SMB_ACL_PURGE request via zfs_smb_acl_mgmt(); no resource
  * names are passed.
  */
 int
 zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path)
 {
 	int rc;
 
 	rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE,
 	    NULL, NULL);
 	return (rc);
 }
 
 /*
  * Issue a ZFS_SMB_ACL_RENAME request from 'oldname' to 'newname' via
  * zfs_smb_acl_mgmt().
  */
 int
 zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path,
     char *oldname, char *newname)
 {
 	int rc;
 
 	rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
 	    oldname, newname);
 	return (rc);
 }
 #endif	/* illumos */
 
 /*
  * Walk the accounting entries of kind 'type' for the dataset, calling
  * func(arg, domain, rid, space) once per entry.
  *
  * Entries are fetched in batches with ZFS_IOC_USERSPACE_MANY until a
  * batch comes back empty.  A non-zero return from 'func' stops the walk
  * and is returned to the caller.  Returns 0 when the walk completes, or
  * the result of zfs_standard_error() if the ioctl fails.
  */
 int
 zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
     zfs_userspace_cb_t func, void *arg)
 {
 	zfs_cmd_t zc = { 0 };
 	zfs_useracct_t buf[100];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	int ret;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	zc.zc_objset_type = type;
 	zc.zc_nvlist_dst = (uintptr_t)buf;
 
 	for (;;) {
 		zfs_useracct_t *zua = buf;
 
 		/* Reset the size each round; it is overwritten on return. */
 		zc.zc_nvlist_dst_size = sizeof (buf);
 		if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
 			/* Save errno before formatting can disturb it. */
 			int ioc_errno = errno;
 			char errbuf[1024];
 
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN,
 			    "cannot get used/quota for %s"), zc.zc_name);
 			/*
 			 * errbuf is a finished message, not a format string,
 			 * so use the non-format variant.
 			 */
 			return (zfs_standard_error(hdl, ioc_errno, errbuf));
 		}
 		if (zc.zc_nvlist_dst_size == 0)
 			break;
 
 		/* Consume the batch one fixed-size record at a time. */
 		while (zc.zc_nvlist_dst_size > 0) {
 			if ((ret = func(arg, zua->zu_domain, zua->zu_rid,
 			    zua->zu_space)) != 0)
 				return (ret);
 			zua++;
 			zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Argument passed to the zfs_hold_one() and zfs_release_one() iteration
  * callbacks.
  */
 struct holdarg {
 	nvlist_t *nvl;		/* output, keyed by full snapshot name */
 	const char *snapname;	/* name appended after '@' to each dataset */
 	const char *tag;	/* hold tag to add or release */
 	boolean_t recursive;	/* also visit child filesystems */
 	int error;		/* set by zfs_release_one() (ENOENT/ESRCH) */
 };
 
 /*
  * Iteration callback for zfs_hold(): if "<dataset>@<snapname>" exists,
  * record it with the tag in ha->nvl.  When ha->recursive is set, repeat
  * for the child filesystems.  The handle is always closed here.
  */
 static int
 zfs_hold_one(zfs_handle_t *zhp, void *arg)
 {
 	struct holdarg *ha = arg;
 	char snap[ZFS_MAXNAMELEN];
 	int rc;
 
 	(void) snprintf(snap, sizeof (snap),
 	    "%s@%s", zhp->zfs_name, ha->snapname);
 
 	if (lzc_exists(snap))
 		fnvlist_add_string(ha->nvl, snap, ha->tag);
 
 	rc = ha->recursive ?
 	    zfs_iter_filesystems(zhp, zfs_hold_one, ha) : 0;
 	zfs_close(zhp);
 	return (rc);
 }
 
 /*
  * Place a hold named 'tag' on snapshot 'snapname' of the dataset, and of
  * its child filesystems when 'recursive' is set.  The matching snapshots
  * are collected by zfs_hold_one() and passed to zfs_hold_nvl().  If no
  * matching snapshot exists, ENOENT is reported and returned.
  */
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive, int cleanup_fd)
 {
 	struct holdarg ha;
 	int rc;
 
 	ha.nvl = fnvlist_alloc();
 	ha.recursive = recursive;
 	ha.snapname = snapname;
 	ha.tag = tag;
 	(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
 
 	if (!nvlist_empty(ha.nvl)) {
 		rc = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
 		fnvlist_free(ha.nvl);
 		return (rc);
 	}
 
 	/* Nothing matched: report the missing snapshot. */
 	{
 		char msg[1024];
 
 		fnvlist_free(ha.nvl);
 		rc = ENOENT;
 		(void) snprintf(msg, sizeof (msg),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot hold snapshot '%s@%s'"),
 		    zhp->zfs_name, snapname);
 		(void) zfs_standard_error(zhp->zfs_hdl, rc, msg);
 	}
 	return (rc);
 }
 
 /*
  * Apply the holds described by 'holds' with lzc_hold() and report any
  * failure through the library handle.  Returns 0 on success, otherwise
  * the value returned by lzc_hold().
  */
 int
 zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *errlist = NULL;
 	nvpair_t *pair;
 	char msg[1024];
 	int rc;
 
 	rc = lzc_hold(holds, cleanup_fd, &errlist);
 	if (rc == 0) {
 		/* There may be errors even in the success case. */
 		fnvlist_free(errlist);
 		return (0);
 	}
 
 	if (nvlist_empty(errlist)) {
 		/* no hold-specific errors */
 		(void) snprintf(msg, sizeof (msg),
 		    dgettext(TEXT_DOMAIN, "cannot hold"));
 		if (rc == ENOTSUP) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, msg);
 		} else if (rc == EINVAL) {
 			(void) zfs_error(hdl, EZFS_BADTYPE, msg);
 		} else {
 			(void) zfs_standard_error(hdl, rc, msg);
 		}
 	}
 
 	/* Report each snapshot that has its own error entry. */
 	pair = nvlist_next_nvpair(errlist, NULL);
 	while (pair != NULL) {
 		int code;
 
 		(void) snprintf(msg, sizeof (msg),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot hold snapshot '%s'"), nvpair_name(pair));
 		code = fnvpair_value_int32(pair);
 		switch (code) {
 		case E2BIG:
 			/*
 			 * Temporary tags wind up having the ds object id
 			 * prepended, so a tag that looked short enough can
 			 * still wind up being slightly too long.
 			 */
 			(void) zfs_error(hdl, EZFS_TAGTOOLONG, msg);
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADTYPE, msg);
 			break;
 		case EEXIST:
 			(void) zfs_error(hdl, EZFS_REFTAG_HOLD, msg);
 			break;
 		default:
 			(void) zfs_standard_error(hdl, code, msg);
 		}
 		pair = nvlist_next_nvpair(errlist, pair);
 	}
 
 	fnvlist_free(errlist);
 	return (rc);
 }
 
 /*
  * Iteration callback for zfs_release(): if "<dataset>@<snapname>" carries
  * the hold ha->tag, queue it in ha->nvl for release.  Otherwise record
  * why not in ha->error: ENOENT when the holds cannot be fetched, ESRCH
  * when the tag is absent.  When ha->recursive is set, repeat for the
  * child filesystems.  The handle is always closed here.
  */
 static int
 zfs_release_one(zfs_handle_t *zhp, void *arg)
 {
 	struct holdarg *ha = arg;
 	char name[ZFS_MAXNAMELEN];
 	int rv = 0;
 	nvlist_t *existing_holds = NULL;
 
 	(void) snprintf(name, sizeof (name),
 	    "%s@%s", zhp->zfs_name, ha->snapname);
 
 	if (lzc_get_holds(name, &existing_holds) != 0) {
 		ha->error = ENOENT;
 	} else {
 		if (!nvlist_exists(existing_holds, ha->tag)) {
 			ha->error = ESRCH;
 		} else {
 			nvlist_t *torelease = fnvlist_alloc();
 			fnvlist_add_boolean(torelease, ha->tag);
 			fnvlist_add_nvlist(ha->nvl, name, torelease);
 			fnvlist_free(torelease);
 		}
 		/* The holds list is no longer used; don't leak it. */
 		nvlist_free(existing_holds);
 	}
 
 	if (ha->recursive)
 		rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Release the hold 'tag' from snapshot 'snapname' of the dataset, and of
  * its child filesystems when 'recursive' is set.
  *
  * The snapshots that carry the tag are collected by zfs_release_one() and
  * released with lzc_release().  If none is found, the error recorded
  * during collection is reported and returned.  Otherwise returns 0 on
  * success or the value returned by lzc_release().
  */
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
 {
 	int ret;
 	struct holdarg ha;
 	nvlist_t *errors = NULL;
 	nvpair_t *elem;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 
 	ha.nvl = fnvlist_alloc();
 	ha.snapname = snapname;
 	ha.tag = tag;
 	ha.recursive = recursive;
 	ha.error = 0;
 	(void) zfs_release_one(zfs_handle_dup(zhp), &ha);
 
 	if (nvlist_empty(ha.nvl)) {
 		/* Nothing to release: report what the collection found. */
 		fnvlist_free(ha.nvl);
 		ret = ha.error;
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot release hold from snapshot '%s@%s'"),
 		    zhp->zfs_name, snapname);
 		if (ret == ESRCH) {
 			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, ret, errbuf);
 		}
 		return (ret);
 	}
 
 	ret = lzc_release(ha.nvl, &errors);
 	fnvlist_free(ha.nvl);
 
 	if (ret == 0) {
 		/* There may be errors even in the success case. */
 		fnvlist_free(errors);
 		return (0);
 	}
 
 	if (nvlist_empty(errors)) {
 		/* no hold-specific errors */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot release"));
 		/*
 		 * Dispatch on the code lzc_release() returned, as
 		 * zfs_hold_nvl() does for lzc_hold(); errno may have been
 		 * overwritten since the call.
 		 */
 		switch (ret) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		default:
 			/* errbuf is a message, not a format string. */
 			(void) zfs_standard_error(hdl, ret, errbuf);
 		}
 	}
 
 	/* Report each snapshot that has its own error entry. */
 	for (elem = nvlist_next_nvpair(errors, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(errors, elem)) {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot release hold from snapshot '%s'"),
 		    nvpair_name(elem));
 		switch (fnvpair_value_int32(elem)) {
 		case ESRCH:
 			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl,
 			    fnvpair_value_int32(elem), errbuf);
 		}
 	}
 
 	fnvlist_free(errors);
 	return (ret);
 }
 
 /*
  * Fetch the dataset's permission set with ZFS_IOC_GET_FSACL and unpack it
  * into *nvl.
  *
  * The receive buffer starts at 2048 bytes.  When the ioctl fails with
  * ENOMEM the buffer is reallocated using the size written back in
  * zc_nvlist_dst_size and the call is retried.  Returns 0 on success,
  * otherwise the result of the zfs_error()/zfs_standard_error() call that
  * reported the failure.
  */
 int
 zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	int nvsz = 2048;
 	void *nvbuf;
 	int err = 0;
 	char errbuf[1024];
 
 	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
 	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 tryagain:
 
 	nvbuf = malloc(nvsz);
 	if (nvbuf == NULL) {
 		err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
 		goto out;
 	}
 
 	zc.zc_nvlist_dst_size = nvsz;
 	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
 
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) {
 		/* Save errno before the formatting calls can disturb it. */
 		int ioc_errno = errno;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
 		    zc.zc_name);
 		switch (ioc_errno) {
 		case ENOMEM:
 			/* Buffer too small: retry with the reported size. */
 			free(nvbuf);
 			nvsz = zc.zc_nvlist_dst_size;
 			goto tryagain;
 
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EINVAL:
 			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		case ENOENT:
 			err = zfs_error(hdl, EZFS_NOENT, errbuf);
 			break;
 		default:
 			/* errbuf is a message, not a format string. */
 			err = zfs_standard_error(hdl, ioc_errno, errbuf);
 			break;
 		}
 	} else {
 		/* success */
 		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
 		if (rc) {
 			(void) snprintf(errbuf, sizeof (errbuf), dgettext(
 			    TEXT_DOMAIN, "cannot get permissions on '%s'"),
 			    zc.zc_name);
 			err = zfs_standard_error(hdl, rc, errbuf);
 		}
 	}
 
 	free(nvbuf);
 out:
 	return (err);
 }
 
 /*
  * Pack 'nvl' in native encoding and submit it with ZFS_IOC_SET_FSACL.
  * 'un' is passed through as zc_perm_action.  Returns 0 on success,
  * otherwise the result of the zfs_error()/zfs_standard_error() call that
  * reported the failure.
  */
 int
 zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char *nvbuf;
 	char errbuf[1024];
 	size_t nvsz;
 	int err;
 
 	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
 	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 	err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
 	assert(err == 0);
 
 	nvbuf = malloc(nvsz);
 	if (nvbuf == NULL) {
 		/* Same out-of-memory report as zfs_get_fsacl(). */
 		return (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
 	}
 
 	err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
 	assert(err == 0);
 
 	zc.zc_nvlist_src_size = nvsz;
 	zc.zc_nvlist_src = (uintptr_t)nvbuf;
 	zc.zc_perm_action = un;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
 		/* Save errno before the formatting calls can disturb it. */
 		int ioc_errno = errno;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
 		    zc.zc_name);
 		switch (ioc_errno) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EINVAL:
 			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		case ENOENT:
 			err = zfs_error(hdl, EZFS_NOENT, errbuf);
 			break;
 		default:
 			/* errbuf is a message, not a format string. */
 			err = zfs_standard_error(hdl, ioc_errno, errbuf);
 			break;
 		}
 	}
 
 	free(nvbuf);
 
 	return (err);
 }
 
 /*
  * Fetch the holds on the dataset named by the handle into *nvl using
  * lzc_get_holds().  Returns 0 on success; on failure the error is
  * reported through the library handle and the result of that report is
  * returned.
  */
 int
 zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 {
 	int err;
 	char errbuf[1024];
 
 	err = lzc_get_holds(zhp->zfs_name, nvl);
 
 	if (err != 0) {
 		libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
 		    zhp->zfs_name);
 		switch (err) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EINVAL:
 			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		case ENOENT:
 			err = zfs_error(hdl, EZFS_NOENT, errbuf);
 			break;
 		default:
 			/*
 			 * Report the code the switch dispatched on, not
 			 * errno, and pass errbuf as a message rather than a
 			 * format string.
 			 */
 			err = zfs_standard_error(hdl, err, errbuf);
 			break;
 		}
 	}
 
 	return (err);
 }
 
 /*
  * Convert the zvol's volume size to an appropriate reservation.
  * Note: If this routine is updated, it is necessary to update the ZFS test
  * suite's shell version in reservation.kshlib.
  */
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
 	uint64_t blksz, nblks, ndbufs;
 	char *copies_str;
 	int ncopies = 1;
 
 	/* 'copies' is looked up as a string; it defaults to 1 if absent. */
 	if (nvlist_lookup_string(props,
 	    zfs_prop_to_name(ZFS_PROP_COPIES), &copies_str) == 0)
 		ncopies = atoi(copies_str);
 
 	if (nvlist_lookup_uint64(props,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blksz) != 0)
 		blksz = ZVOL_DEFAULT_BLOCKSIZE;
 
 	/* start with metadnode L0-L6 */
 	ndbufs = 7;
 
 	/* calculate number of indirects */
 	for (nblks = volsize / blksz; nblks > 1; ndbufs += nblks) {
 		nblks += DNODES_PER_LEVEL - 1;
 		nblks /= DNODES_PER_LEVEL;
 	}
 
 	ndbufs *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
 
 	/*
 	 * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
 	 * compressed, but in practice they compress down to about
 	 * 1100 bytes
 	 */
 	ndbufs *= 1ULL << DN_MAX_INDBLKSHIFT;
 
 	/* Data for every copy plus the metadata estimate. */
 	return (volsize * ncopies + ndbufs);
 }
 
 /*
  * Attach/detach the given filesystem to/from the given jail.
  */
 int
 zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
 	unsigned long request;
 	int rc;
 
 	if (!attach) {
 		(void) snprintf(msg, sizeof (msg),
 		    dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
 	} else {
 		(void) snprintf(msg, sizeof (msg),
 		    dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
 	}
 
 	/* Only filesystems may be handed to a jail. */
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "volumes can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, msg));
 	}
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, msg));
 	}
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_jailid = jailid;
 	zc.zc_objset_type = DMU_OST_ZFS;
 
 	request = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
 	rc = ioctl(hdl->libzfs_fd, request, &zc);
 	if (rc != 0)
 		(void) zfs_standard_error(hdl, errno, msg);
 
 	return (rc);
 }
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	(revision 296519)
@@ -1,3686 +1,3687 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <stddef.h>
 #include <fcntl.h>
 #include <sys/param.h>
 #include <sys/mount.h>
 #include <pthread.h>
 #include <umem.h>
 #include <time.h>
 
 #include <libzfs.h>
 #include <libzfs_core.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_fletcher.h"
 #include "libzfs_impl.h"
 #include <zlib.h>
 #include <sha2.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 
 /* FreeBSD-only global; declared here, defined outside this file's view. */
 #ifdef __FreeBSD__
 extern int zfs_ioctl_version;
 #endif
 
 /* in libzfs_dataset.c */
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 /* We need to use something for ENODATA. */
 #define	ENODATA	EIDRM
 
 /* Prototypes for static functions whose definitions appear later. */
 static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
     recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
     uint64_t *, const char *);
 static int guid_to_name(libzfs_handle_t *, const char *,
     uint64_t, boolean_t, char *);
 
 /*
  * All-zero checksum.  cksummer() compares a write record's checksum
  * against it to decide whether a SHA256 must be computed.
  */
 static const zio_cksum_t zero_cksum = { 0 };
 
 /* Argument block consumed by the cksummer() thread. */
 typedef struct dedup_arg {
 	/* descriptor cksummer() wraps with fdopen(.., "r") and reads from */
 	int	inputfd;
 	/* descriptor cksummer() writes the rewritten records to */
 	int	outputfd;
 	/* library handle passed to zfs_alloc()/zfs_realloc()/ddt_update() */
 	libzfs_handle_t  *dedup_hdl;
 } dedup_arg_t;
 
 /* Argument block consumed by send_progress_thread(). */
 typedef struct progress_arg {
 	/* snapshot handle; its name is copied into the ioctl and printed */
 	zfs_handle_t *pa_zhp;
 	/* placed in zc_cookie for each ZFS_IOC_SEND_PROGRESS call */
 	int pa_fd;
 	/* selects the tab-separated output format over the padded one */
 	boolean_t pa_parsable;
 } progress_arg_t;
 
 /*
  * Location of a data block in the stream.  cksummer() fills it from a
  * DRR_WRITE record's drr_toguid, drr_object and drr_offset.
  */
 typedef struct dataref {
 	uint64_t ref_guid;
 	uint64_t ref_object;
 	uint64_t ref_offset;
 } dataref_t;
 
 /* One entry in a hash-bucket chain of the dedup table. */
 typedef struct dedup_entry {
 	/* next entry in the same bucket, or NULL at the end of the chain */
 	struct dedup_entry	*dde_next;
 	/* checksum and property word that together form the lookup key */
 	zio_cksum_t dde_chksum;
 	uint64_t dde_prop;
 	/* where the block with this key was first seen */
 	dataref_t dde_ref;
 } dedup_entry_t;
 
 /*
  * Sizing limits used by cksummer(): the table may grow to the larger of
  * MAX_DDT_PHYSMEM_PERCENT percent of physical memory and
  * SMALLEST_POSSIBLE_MAX_DDT_MB megabytes.
  */
 #define	MAX_DDT_PHYSMEM_PERCENT		20
 #define	SMALLEST_POSSIBLE_MAX_DDT_MB		128
 
 /* In-memory dedup table: an array of bucket heads plus bookkeeping. */
 typedef struct dedup_table {
 	/* bucket heads, indexed by the low numhashbits bits of the checksum */
 	dedup_entry_t	**dedup_hash_array;
 	/* umem cache that dedup_entry_t objects are allocated from */
 	umem_cache_t	*ddecache;
 	uint64_t	max_ddt_size;  /* max dedup table size in bytes */
 	uint64_t	cur_ddt_size;  /* current dedup table size in bytes */
 	/* number of entries inserted so far */
 	uint64_t	ddt_count;
 	/* number of checksum bits used to pick a bucket */
 	int		numhashbits;
 	/* set once the "table full" message has been recorded */
 	boolean_t	ddt_full;
 } dedup_table_t;
 
 /*
  * Return the 1-based position of the most significant set bit of n,
  * i.e. the number of right shifts needed to reach zero (0 for n == 0).
  */
 static int
 high_order_bit(uint64_t n)
 {
 	int nbits = 0;
 
 	while (n != 0) {
 		n >>= 1;
 		nbits++;
 	}
 	return (nbits);
 }
 
 /*
  * Read exactly one item of len bytes from stream into buf.
  * Returns 1 when the whole item was read and 0 otherwise, which is
  * what fread() reports for an item count of one.
  */
 static size_t
 ssread(void *buf, size_t len, FILE *stream)
 {
 	return (fread(buf, len, 1, stream));
 }
 
 /*
  * Allocate a new dedup entry for (cs, prop, dr) and hook it in at *ddepp,
  * which must currently be NULL (the tail link of a bucket chain).
  *
  * Nothing is inserted when the table has reached max_ddt_size; the first
  * time that happens a message is recorded via zfs_error_aux().  An
  * allocation failure is silently ignored.
  */
 static void
 ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
     zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
 {
 	dedup_entry_t *entry;
 
 	if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
 		/* already reported on an earlier call */
 		if (ddt->ddt_full != B_FALSE)
 			return;
 
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "Dedup table full.  Deduplication will continue "
 		    "with existing table entries"));
 		ddt->ddt_full = B_TRUE;
 		return;
 	}
 
 	entry = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT);
 	if (entry == NULL)
 		return;
 
 	assert(*ddepp == NULL);
 	entry->dde_chksum = *cs;
 	entry->dde_prop = prop;
 	entry->dde_ref = *dr;
 	entry->dde_next = NULL;
 	*ddepp = entry;
 
 	ddt->ddt_count++;
 	ddt->cur_ddt_size += sizeof (dedup_entry_t);
 }
 
 /*
  * Look up the (cs, prop) pair in the dedup table.
  *
  * On a hit, the stored block reference is copied into *dr and B_TRUE is
  * returned.  On a miss, a new entry carrying the caller's *dr is appended
  * to the bucket chain (subject to the limits in ddt_hash_append()) and
  * B_FALSE is returned.
  */
 static boolean_t
 ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
     uint64_t prop, dataref_t *dr)
 {
 	dedup_entry_t **linkp;
 	uint32_t bucket;
 
 	/* the low numhashbits bits of the first checksum word pick a bucket */
 	bucket = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
 
 	linkp = &(ddt->dedup_hash_array[bucket]);
 	while (*linkp != NULL) {
 		dedup_entry_t *cur = *linkp;
 
 		if (ZIO_CHECKSUM_EQUAL((cur->dde_chksum), *cs) &&
 		    cur->dde_prop == prop) {
 			*dr = cur->dde_ref;
 			return (B_TRUE);
 		}
 		linkp = &(cur->dde_next);
 	}
 
 	/* linkp now addresses the NULL tail link of the chain */
 	ddt_hash_append(hdl, ddt, linkp, cs, prop, dr);
 	return (B_FALSE);
 }
 
 /*
  * Write one replay record, plus an optional payload, to outfd while
  * keeping the running fletcher-4 stream checksum *zc up to date.
  *
  * drr		record to emit; for every type except DRR_BEGIN its
  *		trailing checksum field is overwritten with the running
  *		checksum before it is written
  * payload	payload bytes, only touched when payload_len != 0
  * zc		running checksum, updated in place
  *
  * Returns 0 on success or errno when write() returns -1.  A short
  * write (fewer bytes than requested) is not detected.
  */
 static int
 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
     zio_cksum_t *zc, int outfd)
 {
 	/* the checksum field must be the last sizeof (zio_cksum_t) bytes */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	/* fold in everything that precedes the checksum field */
 	fletcher_4_incremental_native(drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
 	if (drr->drr_type != DRR_BEGIN) {
 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
 		    drr_checksum.drr_checksum));
 		drr->drr_u.drr_checksum.drr_checksum = *zc;
 	}
 	/* then fold in the checksum field itself, as it will be written */
 	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
 	    sizeof (zio_cksum_t), zc);
 	if (write(outfd, drr, sizeof (*drr)) == -1)
 		return (errno);
 	if (payload_len != 0) {
 		fletcher_4_incremental_native(payload, payload_len, zc);
 		if (write(outfd, payload, payload_len) == -1)
 			return (errno);
 	}
 	return (0);
 }
 
 /*
  * This function is started in a separate thread when the dedup option
  * has been requested.  The main send thread determines the list of
  * snapshots to be included in the send stream and makes the ioctl calls
  * for each one.  But instead of having the ioctl send the output to the
  * output fd specified by the caller of zfs_send(), the
  * ioctl is told to direct the output to a pipe, which is read by the
  * alternate thread running THIS function.  This function does the
  * dedup'ing by:
  *  1. building a dedup table (the DDT)
  *  2. doing checksums on each data block and inserting a record in the DDT
  *  3. looking for matching checksums, and
  *  4.  sending a DRR_WRITE_BYREF record instead of a write record whenever
  *      a duplicate block is found.
  * The output of this function then goes to the output fd requested
  * by the caller of zfs_send().
  */
 static void *
 cksummer(void *arg)
 {
 	dedup_arg_t *dda = arg;
 	char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
 	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
 	FILE *ofp;
 	int outfd;
 	dedup_table_t ddt;
 	zio_cksum_t stream_cksum;
 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
 	uint64_t numbuckets;
 
 	/*
 	 * Cap the table at a fraction of physical memory, but never below
 	 * SMALLEST_POSSIBLE_MAX_DDT_MB megabytes.
 	 */
 	ddt.max_ddt_size =
 	    MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100,
 	    SMALLEST_POSSIBLE_MAX_DDT_MB << 20);
 
 	numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t));
 
 	/*
 	 * numbuckets must be a power of 2.  Increase number to
 	 * a power of 2 if necessary.  The shift is done in 64 bits:
 	 * an int-typed "1 << n" overflows once n reaches 31, which
 	 * happens on machines with enough physical memory.
 	 */
 	if (!ISP2(numbuckets))
 		numbuckets = 1ULL << high_order_bit(numbuckets);
 
 	ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
 	ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
 	    NULL, NULL, NULL, NULL, NULL, 0);
 	ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
 	ddt.numhashbits = high_order_bit(numbuckets) - 1;
 	ddt.ddt_full = B_FALSE;
 
 	outfd = dda->outputfd;
 	ofp = fdopen(dda->inputfd, "r");
 	/* copy records from ofp to outfd until the input runs dry */
 	while (ssread(drr, sizeof (*drr), ofp) != 0) {
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
 		{
 			struct drr_begin *drrb = &drr->drr_u.drr_begin;
 			int fflags;
 			int sz = 0;
 			/* each BEGIN record restarts the running checksum */
 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
 
 			ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 
 			/* set the DEDUP feature flag for this stream */
 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 			fflags |= (DMU_BACKUP_FEATURE_DEDUP |
 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
 
 			if (drr->drr_payloadlen != 0) {
 				sz = drr->drr_payloadlen;
 
 				if (sz > SPA_MAXBLOCKSIZE) {
 					buf = zfs_realloc(dda->dedup_hdl, buf,
 					    SPA_MAXBLOCKSIZE, sz);
 				}
 				(void) ssread(buf, sz, ofp);
 				/* check the stream that was actually read */
 				if (ferror(ofp))
 					perror("fread");
 			}
 			if (dump_record(drr, buf, sz, &stream_cksum,
 			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_END:
 		{
 			struct drr_end *drre = &drr->drr_u.drr_end;
 			/* use the recalculated checksum */
 			drre->drr_checksum = stream_cksum;
 			if (dump_record(drr, NULL, 0, &stream_cksum,
 			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_OBJECT:
 		{
 			struct drr_object *drro = &drr->drr_u.drr_object;
 			/* the bonus payload is padded to a multiple of 8 */
 			if (drro->drr_bonuslen > 0) {
 				(void) ssread(buf,
 				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
 				    ofp);
 			}
 			if (dump_record(drr, buf,
 			    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
 			    &stream_cksum, outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_SPILL:
 		{
 			struct drr_spill *drrs = &drr->drr_u.drr_spill;
 			(void) ssread(buf, drrs->drr_length, ofp);
 			if (dump_record(drr, buf, drrs->drr_length,
 			    &stream_cksum, outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_FREEOBJECTS:
 		{
 			if (dump_record(drr, NULL, 0, &stream_cksum,
 			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_WRITE:
 		{
 			struct drr_write *drrw = &drr->drr_u.drr_write;
 			dataref_t	dataref;
 
 			(void) ssread(buf, drrw->drr_length, ofp);
 
 			/*
 			 * Use the existing checksum if it's dedup-capable,
 			 * else calculate a SHA256 checksum for it.
 			 */
 
 			if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
 			    zero_cksum) ||
 			    !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
 				SHA256_CTX	ctx;
 				zio_cksum_t	tmpsha256;
 
 				SHA256Init(&ctx);
 				SHA256Update(&ctx, buf, drrw->drr_length);
 				SHA256Final(&tmpsha256, &ctx);
 				drrw->drr_key.ddk_cksum.zc_word[0] =
 				    BE_64(tmpsha256.zc_word[0]);
 				drrw->drr_key.ddk_cksum.zc_word[1] =
 				    BE_64(tmpsha256.zc_word[1]);
 				drrw->drr_key.ddk_cksum.zc_word[2] =
 				    BE_64(tmpsha256.zc_word[2]);
 				drrw->drr_key.ddk_cksum.zc_word[3] =
 				    BE_64(tmpsha256.zc_word[3]);
 				drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
 				drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
 			}
 
 			/*
 			 * Describe this block; on a table hit ddt_update()
 			 * replaces it with the earlier block's location.
 			 */
 			dataref.ref_guid = drrw->drr_toguid;
 			dataref.ref_object = drrw->drr_object;
 			dataref.ref_offset = drrw->drr_offset;
 
 			if (ddt_update(dda->dedup_hdl, &ddt,
 			    &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
 			    &dataref)) {
 				dmu_replay_record_t wbr_drr = {0};
 				struct drr_write_byref *wbr_drrr =
 				    &wbr_drr.drr_u.drr_write_byref;
 
 				/* block already present in stream */
 				wbr_drr.drr_type = DRR_WRITE_BYREF;
 
 				wbr_drrr->drr_object = drrw->drr_object;
 				wbr_drrr->drr_offset = drrw->drr_offset;
 				wbr_drrr->drr_length = drrw->drr_length;
 				wbr_drrr->drr_toguid = drrw->drr_toguid;
 				wbr_drrr->drr_refguid = dataref.ref_guid;
 				wbr_drrr->drr_refobject =
 				    dataref.ref_object;
 				wbr_drrr->drr_refoffset =
 				    dataref.ref_offset;
 
 				wbr_drrr->drr_checksumtype =
 				    drrw->drr_checksumtype;
 				/* carry the flags over, not the type */
 				wbr_drrr->drr_checksumflags =
 				    drrw->drr_checksumflags;
 				wbr_drrr->drr_key.ddk_cksum =
 				    drrw->drr_key.ddk_cksum;
 				wbr_drrr->drr_key.ddk_prop =
 				    drrw->drr_key.ddk_prop;
 
 				if (dump_record(&wbr_drr, NULL, 0,
 				    &stream_cksum, outfd) != 0)
 					goto out;
 			} else {
 				/* block not previously seen */
 				if (dump_record(drr, buf, drrw->drr_length,
 				    &stream_cksum, outfd) != 0)
 					goto out;
 			}
 			break;
 		}
 
 		case DRR_WRITE_EMBEDDED:
 		{
 			struct drr_write_embedded *drrwe =
 			    &drr->drr_u.drr_write_embedded;
 			(void) ssread(buf,
 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
 			if (dump_record(drr, buf,
 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
 			    &stream_cksum, outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_FREE:
 		{
 			if (dump_record(drr, NULL, 0, &stream_cksum,
 			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		default:
 			(void) fprintf(stderr, "INVALID record type 0x%x\n",
 			    drr->drr_type);
 			/* should never happen, so assert */
 			assert(B_FALSE);
 		}
 	}
 out:
 	/* reached on end of input and on the first failed dump_record() */
 	umem_cache_destroy(ddt.ddecache);
 	free(ddt.dedup_hash_array);
 	free(buf);
 	(void) fclose(ofp);
 
 	return (NULL);
 }
 
 /*
  * Routines for dealing with the AVL tree of fs-nvlists
  */
 /* One snapshot in the guid-ordered AVL tree built by fsavl_create(). */
 typedef struct fsavl_node {
 	/* AVL linkage; its offset is handed to avl_create() */
 	avl_node_t fn_node;
 	/* nvlist of the filesystem that contains the snapshot (borrowed) */
 	nvlist_t *fn_nvfs;
 	/* name of the snapshot's nvpair in the "snaps" list (borrowed) */
 	char *fn_snapname;
 	/* snapshot guid; the sort key used by fsavl_compare() */
 	uint64_t fn_guid;
 } fsavl_node_t;
 
 /*
  * AVL comparator: order two fsavl_node_t objects by fn_guid.
  * Returns -1, 0 or +1.
  */
 static int
 fsavl_compare(const void *arg1, const void *arg2)
 {
 	const fsavl_node_t *lhs = arg1;
 	const fsavl_node_t *rhs = arg2;
 
 	if (lhs->fn_guid < rhs->fn_guid)
 		return (-1);
 	if (lhs->fn_guid > rhs->fn_guid)
 		return (+1);
 	return (0);
 }
 
 /*
  * Look up a snapshot by GUID.  Returns the nvlist of the filesystem that
  * contains it, or NULL when the GUID is not in the tree.  When snapname
  * is non-NULL and the snapshot is found, *snapname is set to its name.
  */
 static nvlist_t *
 fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname)
 {
 	fsavl_node_t key;
 	fsavl_node_t *hit;
 
 	/* only fn_guid is examined by the comparator */
 	key.fn_guid = snapguid;
 
 	hit = avl_find(avl, &key, NULL);
 	if (hit == NULL)
 		return (NULL);
 
 	if (snapname != NULL)
 		*snapname = hit->fn_snapname;
 	return (hit->fn_nvfs);
 }
 
 /*
  * Free every node of the tree, then the tree itself.  A NULL tree is
  * accepted and ignored.
  */
 static void
 fsavl_destroy(avl_tree_t *avl)
 {
 	void *cookie = NULL;
 	fsavl_node_t *node;
 
 	if (avl == NULL)
 		return;
 
 	for (;;) {
 		node = avl_destroy_nodes(avl, &cookie);
 		if (node == NULL)
 			break;
 		free(node);
 	}
 	avl_destroy(avl);
 	free(avl);
 }
 
 /*
  * Build an AVL tree, keyed by snapshot guid, from the "snaps" list of
  * every filesystem nvlist in fss.  Returns NULL if memory runs out.
  * The nodes point into fss rather than copying from it.
  */
 static avl_tree_t *
 fsavl_create(nvlist_t *fss)
 {
 	avl_tree_t *tree;
 	nvpair_t *fspair;
 
 	tree = malloc(sizeof (avl_tree_t));
 	if (tree == NULL)
 		return (NULL);
 
 	avl_create(tree, fsavl_compare, sizeof (fsavl_node_t),
 	    offsetof(fsavl_node_t, fn_node));
 
 	for (fspair = nvlist_next_nvpair(fss, NULL); fspair != NULL;
 	    fspair = nvlist_next_nvpair(fss, fspair)) {
 		nvlist_t *nvfs, *snaps;
 		nvpair_t *snappair;
 
 		VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
 		VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
 
 		for (snappair = nvlist_next_nvpair(snaps, NULL);
 		    snappair != NULL;
 		    snappair = nvlist_next_nvpair(snaps, snappair)) {
 			fsavl_node_t *node;
 			uint64_t guid;
 
 			VERIFY(0 == nvpair_value_uint64(snappair, &guid));
 			node = malloc(sizeof (fsavl_node_t));
 			if (node == NULL) {
 				fsavl_destroy(tree);
 				return (NULL);
 			}
 			node->fn_guid = guid;
 			node->fn_snapname = nvpair_name(snappair);
 			node->fn_nvfs = nvfs;
 
 			/*
 			 * Several snapshots may share a GUID; only the
 			 * first one encountered is kept in the tree.
 			 */
 			if (avl_find(tree, node, NULL) != NULL) {
 				free(node);
 				continue;
 			}
 			avl_add(tree, node);
 		}
 	}
 
 	return (tree);
 }
 
 /*
  * Routines for dealing with the giant nvlist of fs-nvlists, etc.
  */
 /* State threaded through the send_iterate_*() callbacks. */
 typedef struct send_data {
 	/*
 	 * guid recorded as "parentfromsnap" for a filesystem; reset per
 	 * filesystem and set by send_iterate_snap() on a name match
 	 */
 	uint64_t parent_fromsnap_guid;
 	/* "snaps" list of the filesystem currently being iterated */
 	nvlist_t *parent_snaps;
 	/* the "fss" list being built; see the format below */
 	nvlist_t *fss;
 	/* "snapprops" list of the filesystem currently being iterated */
 	nvlist_t *snapprops;
 	/* short snapshot names compared against each snapshot; may be NULL */
 	const char *fromsnap;
 	const char *tosnap;
 	/* when set, send_iterate_fs() descends into child filesystems */
 	boolean_t recursive;
 
 	/*
 	 * The header nvlist is of the following format:
 	 * {
 	 *   "tosnap" -> string
 	 *   "fromsnap" -> string (if incremental)
 	 *   "fss" -> {
 	 *	id -> {
 	 *
 	 *	 "name" -> string (full name; for debugging)
 	 *	 "parentfromsnap" -> number (guid of fromsnap in parent)
 	 *
 	 *	 "props" -> { name -> value (only if set here) }
 	 *	 "snaps" -> { name (lastname) -> number (guid) }
 	 *	 "snapprops" -> { name (lastname) -> { name -> value } }
 	 *
 	 *	 "origin" -> number (guid) (if clone)
 	 *	 "sent" -> boolean (not on-disk)
 	 *	}
 	 *   }
 	 * }
 	 *
 	 */
 } send_data_t;
 
 static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv);
 
 /*
  * Snapshot iterator callback.  Records the snapshot's guid in
  * sd->parent_snaps and its sendable properties in sd->snapprops, and
  * updates sd->parent_fromsnap_guid when the snapshot name matches.
  * Closes zhp and always returns 0.
  */
 static int
 send_iterate_snap(zfs_handle_t *zhp, void *arg)
 {
 	send_data_t *sd = arg;
 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
 	char *shortname;
 	nvlist_t *props;
 	boolean_t is_from, is_to_fallback;
 
 	/* the part of the name after the last '@' */
 	shortname = strrchr(zhp->zfs_name, '@')+1;
 
 	VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, shortname, guid));
 
 	/*
 	 * NB: if there is no fromsnap here (it's a newly created fs in
 	 * an incremental replication), we will substitute the tosnap.
 	 */
 	is_from = (sd->fromsnap && strcmp(shortname, sd->fromsnap) == 0);
 	is_to_fallback = (sd->parent_fromsnap_guid == 0 && sd->tosnap &&
 	    strcmp(shortname, sd->tosnap) == 0);
 	if (is_from || is_to_fallback)
 		sd->parent_fromsnap_guid = guid;
 
 	VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
 	send_iterate_prop(zhp, props);
 	VERIFY(0 == nvlist_add_nvlist(sd->snapprops, shortname, props));
 	nvlist_free(props);
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Copy the properties of zhp that should travel with a send stream
  * into nv, as name -> string or name -> uint64 pairs.
  *
  * A property is skipped when it is a non-user property that is unknown
  * or read-only, or when its source is neither this dataset nor the
  * received value.  The four quota/reservation properties are never
  * copied for snapshots and are kept when they have no source at all.
  */
 static void
 send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
 {
 	nvpair_t *elem = NULL;
 
 	while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) {
 		char *propname = nvpair_name(elem);
 		zfs_prop_t prop = zfs_name_to_prop(propname);
 		nvlist_t *propnv;
 
 		if (!zfs_prop_user(propname)) {
 			/*
 			 * Realistically, this should never happen.  However,
 			 * we want the ability to add DSL properties without
 			 * needing to make incompatible version changes.  We
 			 * need to ignore unknown properties to allow older
 			 * software to still send datasets containing these
 			 * properties, with the unknown properties elided.
 			 */
 			if (prop == ZPROP_INVAL)
 				continue;
 
 			if (zfs_prop_readonly(prop))
 				continue;
 		}
 
 		/* each property value is itself an nvlist (value, source) */
 		verify(nvpair_value_nvlist(elem, &propnv) == 0);
 		if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
 		    prop == ZFS_PROP_REFQUOTA ||
 		    prop == ZFS_PROP_REFRESERVATION) {
 			char *source;
 			uint64_t value;
 			verify(nvlist_lookup_uint64(propnv,
 			    ZPROP_VALUE, &value) == 0);
 			if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
 				continue;
 			/*
 			 * May have no source before SPA_VERSION_RECVD_PROPS,
 			 * but is still modifiable.
 			 */
 			if (nvlist_lookup_string(propnv,
 			    ZPROP_SOURCE, &source) == 0) {
 				if ((strcmp(source, zhp->zfs_name) != 0) &&
 				    (strcmp(source,
 				    ZPROP_SOURCE_VAL_RECVD) != 0))
 					continue;
 			}
 		} else {
 			/* all other properties must carry a local/received source */
 			char *source;
 			if (nvlist_lookup_string(propnv,
 			    ZPROP_SOURCE, &source) != 0)
 				continue;
 			if ((strcmp(source, zhp->zfs_name) != 0) &&
 			    (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0))
 				continue;
 		}
 
 		/* user and string-typed properties are strings, the rest uint64 */
 		if (zfs_prop_user(propname) ||
 		    zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
 			char *value;
 			verify(nvlist_lookup_string(propnv,
 			    ZPROP_VALUE, &value) == 0);
 			VERIFY(0 == nvlist_add_string(nv, propname, value));
 		} else {
 			uint64_t value;
 			verify(nvlist_lookup_uint64(propnv,
 			    ZPROP_VALUE, &value) == 0);
 			VERIFY(0 == nvlist_add_uint64(nv, propname, value));
 		}
 	}
 }
 
 /*
  * Recursively generate nvlists describing datasets.  See the comment
  * in send_data_t for a description of the nvlist contents.
  *
  * Filesystem iterator callback: builds the nvlist for zhp, adds it to
  * sd->fss under the dataset guid formatted as "0x%llx", and, when
  * sd->recursive is set, descends into the child filesystems.
  *
  * zhp is closed before returning on every path.  Returns -1 if the
  * origin snapshot of a clone cannot be opened, otherwise the result of
  * the child iteration (0 when not recursive).
  */
 static int
 send_iterate_fs(zfs_handle_t *zhp, void *arg)
 {
 	send_data_t *sd = arg;
 	nvlist_t *nvfs, *nv;
 	int rv = 0;
 	uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
 	char guidstring[64];
 
 	VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0));
 	VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name));
 	VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap",
 	    sd->parent_fromsnap_guid));
 
 	if (zhp->zfs_dmustats.dds_origin[0]) {
 		zfs_handle_t *origin = zfs_open(zhp->zfs_hdl,
 		    zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
 		if (origin == NULL) {
 			/* don't leak the partial nvlist or the handle */
 			nvlist_free(nvfs);
 			zfs_close(zhp);
 			return (-1);
 		}
 		VERIFY(0 == nvlist_add_uint64(nvfs, "origin",
 		    origin->zfs_dmustats.dds_guid));
 		/* only the guid was needed from the origin handle */
 		zfs_close(origin);
 	}
 
 	/* iterate over props */
 	VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
 	send_iterate_prop(zhp, nv);
 	VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv));
 	nvlist_free(nv);
 
 	/* iterate over snaps, and set sd->parent_fromsnap_guid */
 	sd->parent_fromsnap_guid = 0;
 	VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0));
 	VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0));
 	(void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd);
 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps));
 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops));
 	nvlist_free(sd->parent_snaps);
 	nvlist_free(sd->snapprops);
 
 	/* add this fs to nvlist */
 	(void) snprintf(guidstring, sizeof (guidstring),
 	    "0x%llx", (longlong_t)guid);
 	VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs));
 	nvlist_free(nvfs);
 
 	/* iterate over children */
 	if (sd->recursive)
 		rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
 
 	/* restore the value the caller's level was working with */
 	sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
 
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Build the "fss" nvlist for fsname (and, if recursive, its
  * descendants) and optionally an AVL index of its snapshots.
  *
  * On success returns 0 with *nvlp set to the nvlist and, when avlp is
  * non-NULL, *avlp set to the tree.  On failure *nvlp is set to NULL
  * (except when fsname cannot be opened, which returns EZFS_BADTYPE
  * without touching the outputs).
  */
 static int
 gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
     const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp)
 {
 	send_data_t sd = { 0 };
 	zfs_handle_t *zhp;
 	int error;
 
 	zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (zhp == NULL)
 		return (EZFS_BADTYPE);
 
 	VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
 	sd.recursive = recursive;
 	sd.tosnap = tosnap;
 	sd.fromsnap = fromsnap;
 
 	error = send_iterate_fs(zhp, &sd);
 	if (error != 0) {
 		nvlist_free(sd.fss);
 		if (avlp != NULL)
 			*avlp = NULL;
 		*nvlp = NULL;
 		return (error);
 	}
 
 	/* the index is only built when the caller asked for it */
 	if (avlp != NULL) {
 		*avlp = fsavl_create(sd.fss);
 		if (*avlp == NULL) {
 			nvlist_free(sd.fss);
 			*nvlp = NULL;
 			return (EZFS_NOMEM);
 		}
 	}
 
 	*nvlp = sd.fss;
 	return (0);
 }
 
 /*
  * Routines specific to "zfs send"
  */
 /* State shared by dump_filesystem() and dump_snapshot(). */
 typedef struct send_dump_data {
 	/* these are all just the short snapname (the part after the @) */
 	const char *fromsnap;
 	const char *tosnap;
 	/* last snapshot accepted by dump_snapshot(); "" when there is none */
 	char prevsnap[ZFS_MAXNAMELEN];
 	/* ZFS_PROP_OBJSETID of prevsnap */
 	uint64_t prevsnap_obj;
 	/* seenfrom/seento track progress through one filesystem's snapshots */
 	boolean_t seenfrom, seento, replicate, doall, fromorigin;
 	boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
 	boolean_t large_block;
 	/* descriptor the stream is written to */
 	int outfd;
 	/* set by dump_filesystem() when a snapshot could not be sent */
 	boolean_t err;
 	nvlist_t *fss;
 	/* snapshot name -> hold tag; NULL when no holds are gathered */
 	nvlist_t *snapholds;
 	avl_tree_t *fsavl;
 	/* optional per-snapshot filter; B_FALSE from it skips the snapshot */
 	snapfilter_cb_t *filter_cb;
 	void *filter_cb_arg;
 	/* optional nvlist receiving per-snapshot debug info from dump_ioctl() */
 	nvlist_t *debugnv;
 	char holdtag[ZFS_MAXNAMELEN];
 	int cleanup_fd;
 	/* running total of the estimated sizes (verbose mode only) */
 	uint64_t size;
 } send_dump_data_t;
 
 /*
  * Ask the kernel, via ZFS_IOC_SEND with the estimate flag set, how large
  * a send of snapshot zhp would be.  The send is incremental from
  * fromsnap_obj when that is non-zero, or from the origin when fromorigin
  * is set; the two are mutually exclusive.
  *
  * On success stores the estimate in *sizep and returns 0.  On failure
  * returns the libzfs error produced for the errno.
  */
 static int
 estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
     boolean_t fromorigin, uint64_t *sizep)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 	assert(fromsnap_obj == 0 || !fromorigin);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_obj = fromorigin;
 	zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 	zc.zc_fromobj = fromsnap_obj;
 	zc.zc_guid = 1;  /* estimate flag */
 
 	if (zfs_ioctl(hdl, ZFS_IOC_SEND, &zc) == 0) {
 		/* the estimate comes back in zc_objset_type */
 		*sizep = zc.zc_objset_type;
 		return (0);
 	}
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "warning: cannot estimate space for '%s'"), zhp->zfs_name);
 
 	switch (errno) {
 	case EXDEV:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not an earlier snapshot from the same fs"));
 		return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 	case ENOENT:
 		if (zfs_dataset_exists(hdl, zc.zc_name,
 		    ZFS_TYPE_SNAPSHOT)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "incremental source (@%s) does not exist"),
 			    zc.zc_value);
 		}
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 	case EDQUOT:
 	case EFBIG:
 	case EIO:
 	case ENOLINK:
 	case ENOSPC:
 	case ENXIO:
 	case EPIPE:
 	case ERANGE:
 	case EFAULT:
 	case EROFS:
 		zfs_error_aux(hdl, strerror(errno));
 		return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 
 	default:
 		return (zfs_standard_error(hdl, errno, errbuf));
 	}
 }
 
 /*
  * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
  * NULL) to the file descriptor specified by outfd.
  *
  * fromsnap is only recorded for debugging; the kernel is told the
  * incremental source through fromsnap_obj, or through fromorigin when the
  * source is the clone origin (the two are mutually exclusive).  When
  * debugnv is non-NULL, an nvlist keyed by the snapshot name is added to it
  * carrying "fromsnap" and, on failure, the "error" number.
  *
  * Returns 0 on success, otherwise the libzfs error produced for the errno
  * of the failed ZFS_IOC_SEND.
  */
 static int
 dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
     boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
     nvlist_t *debugnv)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *thisdbg;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 	assert(fromsnap_obj == 0 || !fromorigin);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_cookie = outfd;
 	zc.zc_obj = fromorigin;
 	zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 	zc.zc_fromobj = fromsnap_obj;
 	zc.zc_flags = flags;
 
 	VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
 	if (fromsnap && fromsnap[0] != '\0') {
 		VERIFY(0 == nvlist_add_string(thisdbg,
 		    "fromsnap", fromsnap));
 	}
 
 	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
 		/*
 		 * Save errno immediately: the formatting, nvlist and free
 		 * calls below are all allowed to overwrite it before the
 		 * error is classified.
 		 */
 		int err = errno;
 		char errbuf[1024];
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "warning: cannot send '%s'"), zhp->zfs_name);
 
 		VERIFY(0 == nvlist_add_uint64(thisdbg, "error", err));
 		if (debugnv) {
 			VERIFY(0 == nvlist_add_nvlist(debugnv,
 			    zhp->zfs_name, thisdbg));
 		}
 		nvlist_free(thisdbg);
 
 		switch (err) {
 		case EXDEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "not an earlier snapshot from the same fs"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 		case ENOENT:
 			if (zfs_dataset_exists(hdl, zc.zc_name,
 			    ZFS_TYPE_SNAPSHOT)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "incremental source (@%s) does not exist"),
 				    zc.zc_value);
 			}
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		case EDQUOT:
 		case EFBIG:
 		case EIO:
 		case ENOLINK:
 		case ENOSPC:
 #ifdef illumos
 		case ENOSTR:
 #endif
 		case ENXIO:
 		case EPIPE:
 		case ERANGE:
 		case EFAULT:
 		case EROFS:
 			zfs_error_aux(hdl, strerror(err));
 			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, err, errbuf));
 		}
 	}
 
 	if (debugnv)
 		VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
 	nvlist_free(thisdbg);
 
 	return (0);
 }
 
 /*
  * Record that snapshot zhp should be held with sdd->holdtag by adding
  * it to sdd->snapholds.  Does nothing when no hold list is in use.
  */
 static void
 gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
 {
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	/*
 	 * zfs_send() only sets snapholds for sends that need them,
 	 * e.g. replication and doall.
 	 */
 	if (sdd->snapholds != NULL)
 		fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
 }
 
 /*
  * Thread body that reports send progress on stderr once per second.
  * arg is a progress_arg_t.  The loop only ends when the
  * ZFS_IOC_SEND_PROGRESS ioctl fails, in which case (void *)-1 is
  * returned; dump_snapshot() otherwise stops it with pthread_cancel().
  */
 static void *
 send_progress_thread(void *arg)
 {
 	progress_arg_t *pa = arg;
 	zfs_handle_t *zhp = pa->pa_zhp;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	unsigned long long sent;
 	char nicebuf[16];
 	struct tm *now;
 	time_t clock;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	/* the column header is only wanted in the human-readable format */
 	if (!pa->pa_parsable)
 		(void) fprintf(stderr, "TIME        SENT   SNAPSHOT\n");
 
 	/*
 	 * Print the progress from ZFS_IOC_SEND_PROGRESS every second.
 	 */
 	while (1) {
 		(void) sleep(1);
 
 		/* zc_cookie carries the fd in and the byte count out */
 		zc.zc_cookie = pa->pa_fd;
 		if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0)
 			return ((void *)-1);
 
 		(void) time(&clock);
 		now = localtime(&clock);
 		sent = zc.zc_cookie;
 
 		if (!pa->pa_parsable) {
 			zfs_nicenum(sent, nicebuf, sizeof (nicebuf));
 			(void) fprintf(stderr, "%02d:%02d:%02d   %5s   %s\n",
 			    now->tm_hour, now->tm_min, now->tm_sec,
 			    nicebuf, zhp->zfs_name);
 			continue;
 		}
 
 		(void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
 		    now->tm_hour, now->tm_min, now->tm_sec,
 		    sent, zhp->zfs_name);
 	}
 }
 
 /*
  * Print a one-line description of a send to fout.
  *
  * tosnap	name of the snapshot being sent
  * fromsnap	incremental source, or NULL for a full send
  * size		estimated size; omitted from the line when 0
  * parsable	B_TRUE for tab-separated output, otherwise a sentence
  *
  * In the sentence form a fromsnap with neither '@' nor '#' in it is
  * printed with a leading '@'.
  */
 static void
 send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap,
     uint64_t size, boolean_t parsable)
 {
 	if (fromsnap == NULL) {
 		if (parsable) {
 			(void) fprintf(fout, "full\t%s",
 			    tosnap);
 		} else {
 			(void) fprintf(fout, dgettext(TEXT_DOMAIN,
 			    "full send of %s"),
 			    tosnap);
 		}
 	} else if (parsable) {
 		(void) fprintf(fout, "incremental\t%s\t%s",
 		    fromsnap, tosnap);
 	} else if (strchr(fromsnap, '@') == NULL &&
 	    strchr(fromsnap, '#') == NULL) {
 		(void) fprintf(fout, dgettext(TEXT_DOMAIN,
 		    "send from @%s to %s"),
 		    fromsnap, tosnap);
 	} else {
 		(void) fprintf(fout, dgettext(TEXT_DOMAIN,
 		    "send from %s to %s"),
 		    fromsnap, tosnap);
 	}
 
 	if (size != 0 && parsable) {
 		(void) fprintf(fout, "\t%llu",
 		    (longlong_t)size);
 	} else if (size != 0) {
 		char nicebuf[16];
 
 		zfs_nicenum(size, nicebuf, sizeof (nicebuf));
 		(void) fprintf(fout, dgettext(TEXT_DOMAIN,
 		    " estimated size is %s"), nicebuf);
 	}
 	(void) fprintf(fout, "\n");
 }
 
 /*
  * Snapshot iterator callback used by dump_filesystem(): decide whether
  * snapshot zhp belongs in the stream and, if so, send it.
  *
  * sdd->seenfrom and sdd->seento bracket the range of snapshots to
  * consider; sdd->prevsnap and sdd->prevsnap_obj remember the last
  * accepted snapshot, which becomes the incremental source of the next.
  *
  * zhp is closed on every path.  Returns 0 when the snapshot is skipped,
  * the pthread_create() error if the progress thread cannot be started,
  * and otherwise the result of dump_ioctl() (0 on a dry run).
  */
 static int
 dump_snapshot(zfs_handle_t *zhp, void *arg)
 {
 	send_dump_data_t *sdd = arg;
 	progress_arg_t pa = { 0 };
 	pthread_t tid;
 	char *thissnap;
 	int err;
 	boolean_t isfromsnap, istosnap, fromorigin;
 	boolean_t exclude = B_FALSE;
 	FILE *fout = sdd->std_out ? stdout : stderr;
 
 	err = 0;
 	/* short name: the text after the first '@' */
 	thissnap = strchr(zhp->zfs_name, '@') + 1;
 	isfromsnap = (sdd->fromsnap != NULL &&
 	    strcmp(sdd->fromsnap, thissnap) == 0);
 
 	/* the fromsnap itself is never sent; it only seeds prevsnap */
 	if (!sdd->seenfrom && isfromsnap) {
 		gather_holds(zhp, sdd);
 		sdd->seenfrom = B_TRUE;
 		(void) strcpy(sdd->prevsnap, thissnap);
 		sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 		zfs_close(zhp);
 		return (0);
 	}
 
 	/* outside the (fromsnap, tosnap] range: nothing to do */
 	if (sdd->seento || !sdd->seenfrom) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
 	if (istosnap)
 		sdd->seento = B_TRUE;
 
 	if (!sdd->doall && !isfromsnap && !istosnap) {
 		if (sdd->replicate) {
 			char *snapname;
 			nvlist_t *snapprops;
 			/*
 			 * Filter out all intermediate snapshots except origin
 			 * snapshots needed to replicate clones.
 			 */
 			nvlist_t *nvfs = fsavl_find(sdd->fsavl,
 			    zhp->zfs_dmustats.dds_guid, &snapname);
 
 			VERIFY(0 == nvlist_lookup_nvlist(nvfs,
 			    "snapprops", &snapprops));
 			VERIFY(0 == nvlist_lookup_nvlist(snapprops,
 			    thissnap, &snapprops));
 			exclude = !nvlist_exists(snapprops, "is_clone_origin");
 		} else {
 			exclude = B_TRUE;
 		}
 	}
 
 	/*
 	 * If a filter function exists, call it to determine whether
 	 * this snapshot will be sent.
 	 */
 	if (exclude || (sdd->filter_cb != NULL &&
 	    sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
 		/*
 		 * This snapshot is filtered out.  Don't send it, and don't
 		 * set prevsnap_obj, so it will be as if this snapshot didn't
 		 * exist, and the next accepted snapshot will be sent as
 		 * an incremental from the last accepted one, or as the
 		 * first (and full) snapshot in the case of a replication,
 		 * non-incremental send.
 		 */
 		zfs_close(zhp);
 		return (0);
 	}
 
 	gather_holds(zhp, sdd);
 	/* with no previous snapshot, optionally send relative to the origin */
 	fromorigin = sdd->prevsnap[0] == '\0' &&
 	    (sdd->fromorigin || sdd->replicate);
 
 	if (sdd->verbose) {
 		uint64_t size = 0;
 		/* a failed estimate leaves size at 0 and is otherwise ignored */
 		(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
 		    fromorigin, &size);
 
 		send_print_verbose(fout, zhp->zfs_name,
 		    sdd->prevsnap[0] ? sdd->prevsnap : NULL,
 		    size, sdd->parsable);
 		sdd->size += size;
 	}
 
 	if (!sdd->dryrun) {
 		/*
 		 * If progress reporting is requested, spawn a new thread to
 		 * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
 		 */
 		if (sdd->progress) {
 			pa.pa_zhp = zhp;
 			pa.pa_fd = sdd->outfd;
 			pa.pa_parsable = sdd->parsable;
 
 			if (err = pthread_create(&tid, NULL,
 			    send_progress_thread, &pa)) {
 				zfs_close(zhp);
 				return (err);
 			}
 		}
 
 		enum lzc_send_flags flags = 0;
 		if (sdd->large_block)
 			flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (sdd->embed_data)
 			flags |= LZC_SEND_FLAG_EMBED_DATA;
 
 		err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
 		    fromorigin, sdd->outfd, flags, sdd->debugnv);
 
 		/* stop the reporter before zhp, which it uses, is closed */
 		if (sdd->progress) {
 			(void) pthread_cancel(tid);
 			(void) pthread_join(tid, NULL);
 		}
 	}
 
 	/* this snapshot is the incremental source for the next one */
 	(void) strcpy(sdd->prevsnap, thissnap);
 	sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 	zfs_close(zhp);
 	return (err);
 }
 
 /*
  * Send the snapshots of one filesystem.
  *
  * 'arg' is the send_dump_data_t describing the send.  The target snapshot
  * (sdd->tosnap) must exist in 'zhp'; otherwise a warning is printed,
  * sdd->err is set and 0 is returned.  The per-filesystem state in 'sdd'
  * is then reset and every snapshot is handed to dump_snapshot() through
  * zfs_iter_snapshots_sorted().  If the incremental source or the target
  * was never seen during the walk, a warning is printed and sdd->err is
  * set.  The return value is the result of the snapshot iteration.
  */
 static int
 dump_filesystem(zfs_handle_t *zhp, void *arg)
 {
 	send_dump_data_t *sdd = arg;
 	zfs_cmd_t zc = { 0 };
 	boolean_t from_absent = B_FALSE;
 	int iter_rv;
 
 	/* The target snapshot has to exist in this filesystem. */
 	(void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
 	    zhp->zfs_name, sdd->tosnap);
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "WARNING: could not send %s@%s: does not exist\n"),
 		    zhp->zfs_name, sdd->tosnap);
 		sdd->err = B_TRUE;
 		return (0);
 	}
 
 	/*
 	 * For a recursive (replication) send, a filesystem that lacks the
 	 * incremental source gets a full stream from the beginning (or an
 	 * incremental from its origin if it is a clone).  For a
 	 * non-recursive send the missing source is left to be reported as
 	 * an error below.
 	 */
 	if (sdd->replicate && sdd->fromsnap) {
 		(void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
 		    zhp->zfs_name, sdd->fromsnap);
 		if (ioctl(zhp->zfs_hdl->libzfs_fd,
 		    ZFS_IOC_OBJSET_STATS, &zc) != 0)
 			from_absent = B_TRUE;
 	}
 
 	/* Reset the state that dump_snapshot() tracks per filesystem. */
 	sdd->prevsnap[0] = 0;
 	sdd->prevsnap_obj = 0;
 	sdd->seento = B_FALSE;
 	sdd->seenfrom = (sdd->fromsnap == NULL || from_absent) ?
 	    B_TRUE : B_FALSE;
 
 	iter_rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
 
 	/* Both endpoints were encountered: nothing to complain about. */
 	if (sdd->seenfrom && sdd->seento)
 		return (iter_rv);
 
 	if (!sdd->seenfrom) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "WARNING: could not send %s@%s:\n"
 		    "incremental source (%s@%s) does not exist\n"),
 		    zhp->zfs_name, sdd->tosnap,
 		    zhp->zfs_name, sdd->fromsnap);
 	} else if (sdd->fromsnap) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "WARNING: could not send %s@%s:\n"
 		    "incremental source (%s@%s) "
 		    "is not earlier than it\n"),
 		    zhp->zfs_name, sdd->tosnap,
 		    zhp->zfs_name, sdd->fromsnap);
 	} else {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "WARNING: "
 		    "could not send %s@%s: does not exist\n"),
 		    zhp->zfs_name, sdd->tosnap);
 	}
 	sdd->err = B_TRUE;
 
 	return (iter_rv);
 }
 
 /*
  * Send every filesystem covered by the send described by 'arg' (a
  * send_dump_data_t).
  *
  * Without sdd->replicate this is just dump_filesystem() on 'rzhp'.
  * For a replication send, the filesystems listed in sdd->fss are sent
  * in dependency order: a filesystem is only sent once its parent (the
  * "parentfromsnap" entry) and, for clones, its origin have been marked
  * "sent".  The list is rescanned until nothing had to be postponed.
  *
  * Returns 0 on success, -1 if a filesystem cannot be opened, or the first
  * non-zero value returned by dump_filesystem().
  */
 static int
 dump_filesystems(zfs_handle_t *rzhp, void *arg)
 {
 	send_dump_data_t *sdd = arg;
 	nvpair_t *pair;
 	boolean_t retry, advanced;
 
 	if (!sdd->replicate)
 		return (dump_filesystem(rzhp, sdd));
 
 	/* Tag each snapshot that is the origin of a clone in the set. */
 	for (pair = nvlist_next_nvpair(sdd->fss, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(sdd->fss, pair)) {
 		nvlist_t *nvfs, *origin_nv, *snapprops;
 		uint64_t origin_guid = 0;
 		char *snapname;
 
 		VERIFY(0 == nvpair_value_nvlist(pair, &nvfs));
 		(void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
 		if (origin_guid == 0)
 			continue;
 
 		origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname);
 		if (origin_nv == NULL)
 			continue;
 
 		VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
 		    "snapprops", &snapprops));
 		VERIFY(0 == nvlist_lookup_nvlist(snapprops,
 		    snapname, &snapprops));
 		VERIFY(0 == nvlist_add_boolean(
 		    snapprops, "is_clone_origin"));
 	}
 
 	do {
 		retry = B_FALSE;
 		advanced = B_FALSE;
 
 		for (pair = nvlist_next_nvpair(sdd->fss, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(sdd->fss, pair)) {
 			nvlist_t *fslist;
 			char *fsname;
 			zfs_handle_t *zhp;
 			int err;
 			uint64_t origin_guid = 0;
 			uint64_t parent_guid = 0;
 
 			VERIFY(nvpair_value_nvlist(pair, &fslist) == 0);
 
 			/* Already handled on an earlier pass. */
 			if (nvlist_lookup_boolean(fslist, "sent") == 0)
 				continue;
 
 			VERIFY(nvlist_lookup_string(fslist, "name",
 			    &fsname) == 0);
 			(void) nvlist_lookup_uint64(fslist, "origin",
 			    &origin_guid);
 			(void) nvlist_lookup_uint64(fslist, "parentfromsnap",
 			    &parent_guid);
 
 			if (parent_guid != 0) {
 				nvlist_t *parent_nv = fsavl_find(sdd->fsavl,
 				    parent_guid, NULL);
 
 				/* parent has not been sent; postpone */
 				if (!nvlist_exists(parent_nv, "sent")) {
 					retry = B_TRUE;
 					continue;
 				}
 			}
 
 			if (origin_guid != 0) {
 				nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
 				    origin_guid, NULL);
 
 				/* origin has not been sent yet; postpone */
 				if (origin_nv != NULL &&
 				    !nvlist_exists(origin_nv, "sent")) {
 					retry = B_TRUE;
 					continue;
 				}
 			}
 
 			zhp = zfs_open(rzhp->zfs_hdl, fsname,
 			    ZFS_TYPE_DATASET);
 			if (zhp == NULL)
 				return (-1);
 			err = dump_filesystem(zhp, sdd);
 			VERIFY(nvlist_add_boolean(fslist, "sent") == 0);
 			advanced = B_TRUE;
 			zfs_close(zhp);
 			if (err)
 				return (err);
 		}
 
 		/* A pass that postponed work must also have sent something. */
 		if (retry)
 			assert(advanced);
 	} while (retry);
 
 	/* clean out the sent flags in case we reuse this fss */
 	for (pair = nvlist_next_nvpair(sdd->fss, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(sdd->fss, pair)) {
 		nvlist_t *fslist;
 
 		VERIFY(nvpair_value_nvlist(pair, &fslist) == 0);
 		(void) nvlist_remove_all(fslist, "sent");
 	}
 
 	return (0);
 }
 
 /*
  * Decode a textual resume token into the nvlist it encodes.
  *
  * The token has the form
  *   <version>-<checksum of payload>-<uncompressed payload length>-<payload>
  * where the payload is the hex-encoded, zlib-compressed, packed nvlist.
  *
  * Returns the unpacked nvlist (which the caller must free), or NULL on
  * failure.  On failure a description has been recorded with
  * zfs_error_aux() on 'hdl'.
  */
 nvlist_t *
 zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
 {
 	unsigned int version;
 	int nread;
 	unsigned long long checksum, packed_len;
 
 	/*
 	 * Decode token header, which is:
 	 *   <token version>-<checksum of payload>-<uncompressed payload length>
 	 * Note that the only supported token version is 1.
 	 */
 	nread = sscanf(token, "%u-%llx-%llx-",
 	    &version, &checksum, &packed_len);
 	if (nread != 3) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (invalid format)"));
 		return (NULL);
 	}
 
 	if (version != ZFS_SEND_RESUME_TOKEN_VERSION) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (invalid version %u)"),
 		    version);
 		return (NULL);
 	}
 
 	/*
 	 * convert hexadecimal representation to binary; three fields were
 	 * matched above, so the token contains at least two '-' characters
 	 * and strrchr() cannot return NULL here.
 	 */
 	token = strrchr(token, '-') + 1;
 	int len = strlen(token) / 2;
 	if (len == 0) {
 		/* no payload at all; don't ask for a zero-byte buffer */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (invalid format)"));
 		return (NULL);
 	}
 	unsigned char *compressed = zfs_alloc(hdl, len);
 	if (compressed == NULL) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "out of memory"));
 		return (NULL);
 	}
 	for (int i = 0; i < len; i++) {
 		nread = sscanf(token + i * 2, "%2hhx", compressed + i);
 		if (nread != 1) {
 			free(compressed);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "resume token is corrupt "
 			    "(payload is not hex-encoded)"));
 			return (NULL);
 		}
 	}
 
 	/* verify checksum */
 	zio_cksum_t cksum;
 	fletcher_4_native(compressed, len, NULL, &cksum);
 	if (cksum.zc_word[0] != checksum) {
 		free(compressed);
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (incorrect checksum)"));
 		return (NULL);
 	}
 
 	/*
 	 * uncompress.  packed_len comes straight from the token, so the
 	 * allocation can fail for a bogus length; never hand a NULL
 	 * destination to uncompress().
 	 */
 	void *packed = zfs_alloc(hdl, packed_len);
 	if (packed == NULL) {
 		free(compressed);
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "out of memory"));
 		return (NULL);
 	}
 	uLongf packed_len_long = packed_len;
 	if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK ||
 	    packed_len_long != packed_len) {
 		free(packed);
 		free(compressed);
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (decompression failed)"));
 		return (NULL);
 	}
 
 	/* unpack nvlist */
 	nvlist_t *nv;
 	int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP);
 	free(packed);
 	free(compressed);
 	if (error != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt (nvlist_unpack failed)"));
 		return (NULL);
 	}
 	return (nv);
 }
 
 /*
  * Resume an interrupted send described by 'resume_token', writing the
  * stream to 'outfd'.
  *
  * The token is decoded into an nvlist that names the target snapshot
  * ("toname"/"toguid"), the resume point ("object"/"offset"), the number of
  * bytes already sent ("bytes") and, optionally, the incremental source
  * ("fromguid").  The snapshots are located by guid, an estimate is
  * printed when flags->verbose is set, and unless flags->dryrun is set the
  * stream is generated with lzc_send_resume().
  *
  * Returns 0 on success and a non-zero error value otherwise.  The decoded
  * token and the dataset handle are released on every path.
  */
 int
 zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
     const char *resume_token)
 {
 	char errbuf[1024];
 	char *toname;
 	char *fromname = NULL;
 	uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
 	zfs_handle_t *zhp = NULL;
 	int error = 0;
 	char name[ZFS_MAXNAMELEN];
 	enum lzc_send_flags lzc_flags = 0;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot resume send"));
 
 	nvlist_t *resume_nvl =
 	    zfs_send_resume_token_to_nvlist(hdl, resume_token);
 	if (resume_nvl == NULL) {
 		/*
 		 * zfs_error_aux has already been set by
 		 * zfs_send_resume_token_to_nvlist
 		 */
 		return (zfs_error(hdl, EZFS_FAULT, errbuf));
 	}
 	if (flags->verbose) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "resume token contents:\n"));
 		nvlist_print(stderr, resume_nvl);
 	}
 
 	if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
 	    nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
 	    nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
 	    nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
 	    nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "resume token is corrupt"));
 		error = zfs_error(hdl, EZFS_FAULT, errbuf);
 		goto out;
 	}
 	fromguid = 0;
 	(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
 
 	if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
 		lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 
 	if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
 		if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is no longer the same snapshot used in "
 			    "the initial send"), toname);
 		} else {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' used in the initial send no longer exists"),
 			    toname);
 		}
 		error = zfs_error(hdl, EZFS_BADPATH, errbuf);
 		goto out;
 	}
 	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
 	if (zhp == NULL) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "unable to access '%s'"), name);
 		error = zfs_error(hdl, EZFS_BADPATH, errbuf);
 		goto out;
 	}
 
 	if (fromguid != 0) {
 		/* 'name' is reused; zhp keeps its own copy of the target */
 		if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "incremental source %#llx no longer exists"),
 			    (longlong_t)fromguid);
 			error = zfs_error(hdl, EZFS_BADPATH, errbuf);
 			goto out;
 		}
 		fromname = name;
 	}
 
 	if (flags->verbose) {
 		uint64_t size = 0;
 		error = lzc_send_space(zhp->zfs_name, fromname, &size);
 		if (error == 0)
 			size = MAX(0, (int64_t)(size - bytes));
 		send_print_verbose(stderr, zhp->zfs_name, fromname,
 		    size, flags->parsable);
 	}
 
 	if (!flags->dryrun) {
 		progress_arg_t pa = { 0 };
 		pthread_t tid;
 		/*
 		 * If progress reporting is requested, spawn a new thread to
 		 * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
 		 */
 		if (flags->progress) {
 			pa.pa_zhp = zhp;
 			pa.pa_fd = outfd;
 			pa.pa_parsable = flags->parsable;
 
 			error = pthread_create(&tid, NULL,
 			    send_progress_thread, &pa);
 			if (error != 0)
 				goto out;
 		}
 
 		error = lzc_send_resume(zhp->zfs_name, fromname, outfd,
 		    lzc_flags, resumeobj, resumeoff);
 
 		if (flags->progress) {
 			(void) pthread_cancel(tid);
 			(void) pthread_join(tid, NULL);
 		}
 
 		char senderr[1024];
 		(void) snprintf(senderr, sizeof (senderr),
 		    dgettext(TEXT_DOMAIN,
 		    "warning: cannot send '%s'"), zhp->zfs_name);
 
 		/*
 		 * Report using the value returned by lzc_send_resume();
 		 * errno may have been overwritten by the calls made since.
 		 */
 		switch (error) {
 		case 0:
 			break;
 		case EXDEV:
 		case ENOENT:
 		case EDQUOT:
 		case EFBIG:
 		case EIO:
 		case ENOLINK:
 		case ENOSPC:
 #ifdef illumos
 		case ENOSTR:
 #endif
 		case ENXIO:
 		case EPIPE:
 		case ERANGE:
 		case EFAULT:
 		case EROFS:
 			zfs_error_aux(hdl, "%s", strerror(error));
 			error = zfs_error(hdl, EZFS_BADBACKUP, senderr);
 			break;
 
 		default:
 			error = zfs_standard_error(hdl, error, senderr);
 			break;
 		}
 	}
 
 out:
 	if (zhp != NULL)
 		zfs_close(zhp);
 	/* 'toname' points into resume_nvl, so free it only now */
 	nvlist_free(resume_nvl);
 
 	return (error);
 }
 
 /*
  * Generate a send stream for the dataset identified by the argument zhp.
  *
  * The content of the send stream is the snapshot identified by
  * 'tosnap'.  Incremental streams are requested in two ways:
  *     - from the snapshot identified by "fromsnap" (if non-null) or
  *     - from the origin of the dataset identified by zhp, which must
  *	 be a clone.  In this case, "fromsnap" is null and "fromorigin"
  *	 is TRUE.
  *
  * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
  * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM)
  * if "replicate" is set.  If "doall" is set, dump all the intermediate
  * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall"
  * case too. If "props" is set, send properties.
  *
  * The stream is written to 'outfd'.  'filter_func'/'cb_arg' are stored in
  * the dump state as the per-snapshot filter callback.  If 'debugnvp' is
  * non-NULL, the header nvlist built for a replicate/props stream is
  * stored through it instead of being freed.
  *
  * Returns 0 on success and non-zero on failure.
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
     sendflags_t *flags, int outfd, snapfilter_cb_t filter_func,
     void *cb_arg, nvlist_t **debugnvp)
 {
 	char errbuf[1024];
 	send_dump_data_t sdd = { 0 };
 	int err = 0;
 	nvlist_t *fss = NULL;
 	avl_tree_t *fsavl = NULL;
 	static uint64_t holdseq;
 	int spa_version;
 	pthread_t tid = 0;
 	int pipefd[2];
 	dedup_arg_t dda = { 0 };
 	int featureflags = 0;
 	FILE *fout;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot send '%s'"), zhp->zfs_name);
 
 	/*
 	 * The error paths below close sdd.cleanup_fd unless it is -1.  Mark
 	 * it as "not open" right away so that an early failure does not
 	 * close file descriptor 0.
 	 */
 	sdd.cleanup_fd = -1;
 
 	if (fromsnap && fromsnap[0] == '\0') {
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "zero-length incremental source"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
 	}
 
 	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
 		uint64_t version;
 		version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
 		if (version >= ZPL_VERSION_SA) {
 			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
 		}
 	}
 
 	if (flags->dedup && !flags->dryrun) {
 		featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
 		    DMU_BACKUP_FEATURE_DEDUPPROPS);
 		if (err = pipe(pipefd)) {
 			zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno));
 			return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
 			    errbuf));
 		}
 		dda.outputfd = outfd;
 		dda.inputfd = pipefd[1];
 		dda.dedup_hdl = zhp->zfs_hdl;
 		if (err = pthread_create(&tid, NULL, cksummer, &dda)) {
 			(void) close(pipefd[0]);
 			(void) close(pipefd[1]);
 			/*
 			 * pthread_create() returns the error number; it
 			 * does not set errno.
 			 */
 			zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err));
 			return (zfs_error(zhp->zfs_hdl,
 			    EZFS_THREADCREATEFAILED, errbuf));
 		}
 	}
 
 	if (flags->replicate || flags->doall || flags->props) {
 		dmu_replay_record_t drr = { 0 };
 		char *packbuf = NULL;
 		size_t buflen = 0;
 		zio_cksum_t zc = { 0 };
 
 		if (flags->replicate || flags->props) {
 			nvlist_t *hdrnv;
 
 			VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
 			if (fromsnap) {
 				VERIFY(0 == nvlist_add_string(hdrnv,
 				    "fromsnap", fromsnap));
 			}
 			VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
 			if (!flags->replicate) {
 				VERIFY(0 == nvlist_add_boolean(hdrnv,
 				    "not_recursive"));
 			}
 
 			err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
 			    fromsnap, tosnap, flags->replicate, &fss, &fsavl);
 			if (err) {
 				/* the header is not handed to anyone yet */
 				nvlist_free(hdrnv);
 				goto err_out;
 			}
 			VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
 			err = nvlist_pack(hdrnv, &packbuf, &buflen,
 			    NV_ENCODE_XDR, 0);
 			if (debugnvp)
 				*debugnvp = hdrnv;
 			else
 				nvlist_free(hdrnv);
 			if (err)
 				goto stderr_out;
 		}
 
 		if (!flags->dryrun) {
 			/* write first begin record */
 			drr.drr_type = DRR_BEGIN;
 			drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 			DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.
 			    drr_versioninfo, DMU_COMPOUNDSTREAM);
 			DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.
 			    drr_versioninfo, featureflags);
 			(void) snprintf(drr.drr_u.drr_begin.drr_toname,
 			    sizeof (drr.drr_u.drr_begin.drr_toname),
 			    "%s@%s", zhp->zfs_name, tosnap);
 			drr.drr_payloadlen = buflen;
 
 			err = dump_record(&drr, packbuf, buflen, &zc, outfd);
 			free(packbuf);
 			if (err != 0)
 				goto stderr_out;
 
 			/* write end record */
 			bzero(&drr, sizeof (drr));
 			drr.drr_type = DRR_END;
 			drr.drr_u.drr_end.drr_checksum = zc;
 			err = write(outfd, &drr, sizeof (drr));
 			if (err == -1) {
 				err = errno;
 				goto stderr_out;
 			}
 
 			err = 0;
 		} else {
 			/* nothing is written in a dry run; drop the header */
 			free(packbuf);
 		}
 	}
 
 	/* dump each stream */
 	sdd.fromsnap = fromsnap;
 	sdd.tosnap = tosnap;
 	if (tid != 0)
 		sdd.outfd = pipefd[0];
 	else
 		sdd.outfd = outfd;
 	sdd.replicate = flags->replicate;
 	sdd.doall = flags->doall;
 	sdd.fromorigin = flags->fromorigin;
 	sdd.fss = fss;
 	sdd.fsavl = fsavl;
 	sdd.verbose = flags->verbose;
 	sdd.parsable = flags->parsable;
 	sdd.progress = flags->progress;
 	sdd.dryrun = flags->dryrun;
 	sdd.large_block = flags->largeblock;
 	sdd.embed_data = flags->embed_data;
 	sdd.filter_cb = filter_func;
 	sdd.filter_cb_arg = cb_arg;
 	if (debugnvp)
 		sdd.debugnv = *debugnvp;
 	if (sdd.verbose && sdd.dryrun)
 		sdd.std_out = B_TRUE;
 	fout = sdd.std_out ? stdout : stderr;
 
 	/*
 	 * Some flags require that we place user holds on the datasets that are
 	 * being sent so they don't get destroyed during the send. We can skip
 	 * this step if the pool is imported read-only since the datasets cannot
 	 * be destroyed.
 	 */
 	if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp),
 	    ZPOOL_PROP_READONLY, NULL) &&
 	    zfs_spa_version(zhp, &spa_version) == 0 &&
 	    spa_version >= SPA_VERSION_USERREFS &&
 	    (flags->doall || flags->replicate)) {
 		++holdseq;
 		(void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
 		    ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
 		sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
 		if (sdd.cleanup_fd < 0) {
 			err = errno;
 			goto stderr_out;
 		}
 		sdd.snapholds = fnvlist_alloc();
 	} else {
 		sdd.cleanup_fd = -1;
 		sdd.snapholds = NULL;
 	}
 	if (flags->verbose || sdd.snapholds != NULL) {
 		/*
 		 * Do a verbose no-op dry run to get all the verbose output
 		 * or to gather snapshot hold's before generating any data,
 		 * then do a non-verbose real run to generate the streams.
 		 */
 		sdd.dryrun = B_TRUE;
 		err = dump_filesystems(zhp, &sdd);
 
 		if (err != 0)
 			goto stderr_out;
 
 		if (flags->verbose) {
 			if (flags->parsable) {
 				(void) fprintf(fout, "size\t%llu\n",
 				    (longlong_t)sdd.size);
 			} else {
 				char buf[16];
 				zfs_nicenum(sdd.size, buf, sizeof (buf));
 				(void) fprintf(fout, dgettext(TEXT_DOMAIN,
 				    "total estimated size is %s\n"), buf);
 			}
 		}
 
 		/* Ensure no snaps found is treated as an error. */
 		if (!sdd.seento) {
 			err = ENOENT;
 			goto err_out;
 		}
 
 		/* Skip the second run if dryrun was requested. */
 		if (flags->dryrun)
 			goto err_out;
 
 		if (sdd.snapholds != NULL) {
 			err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
 			if (err != 0)
 				goto stderr_out;
 
 			fnvlist_free(sdd.snapholds);
 			sdd.snapholds = NULL;
 		}
 
 		sdd.dryrun = B_FALSE;
 		sdd.verbose = B_FALSE;
 	}
 
 	err = dump_filesystems(zhp, &sdd);
 	fsavl_destroy(fsavl);
 	nvlist_free(fss);
 
 	/* Ensure no snaps found is treated as an error. */
 	if (err == 0 && !sdd.seento)
 		err = ENOENT;
 
 	if (tid != 0) {
 		if (err != 0)
 			(void) pthread_cancel(tid);
 		(void) close(pipefd[0]);
 		(void) pthread_join(tid, NULL);
 	}
 
 	if (sdd.cleanup_fd != -1) {
 		VERIFY(0 == close(sdd.cleanup_fd));
 		sdd.cleanup_fd = -1;
 	}
 
 	if (!flags->dryrun && (flags->replicate || flags->doall ||
 	    flags->props)) {
 		/*
 		 * write final end record.  NB: want to do this even if
 		 * there was some error, because it might not be totally
 		 * failed.
 		 */
 		dmu_replay_record_t drr = { 0 };
 		drr.drr_type = DRR_END;
 		if (write(outfd, &drr, sizeof (drr)) == -1) {
 			return (zfs_standard_error(zhp->zfs_hdl,
 			    errno, errbuf));
 		}
 	}
 
 	return (err || sdd.err);
 
 stderr_out:
 	err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
 err_out:
 	fsavl_destroy(fsavl);
 	nvlist_free(fss);
 	fnvlist_free(sdd.snapholds);
 
 	if (sdd.cleanup_fd != -1)
 		VERIFY(0 == close(sdd.cleanup_fd));
 	if (tid != 0) {
 		(void) pthread_cancel(tid);
 		(void) close(pipefd[0]);
 		(void) pthread_join(tid, NULL);
 	}
 	return (err);
 }
 
 /*
  * Send a single snapshot (or dataset) 'zhp' to 'fd' with lzc_send(),
  * optionally as an incremental from 'from', and translate a failure into
  * a libzfs error on the handle's libzfs_handle_t.
  *
  * Returns 0 on success, otherwise the value produced by zfs_error() or
  * zfs_standard_error() for the failure.
  */
 int
 zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
     enum lzc_send_flags flags)
 {
 	int err;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 	char errbuf[1024];
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "warning: cannot send '%s'"), zhp->zfs_name);
 
 	err = lzc_send(zhp->zfs_name, from, fd, flags);
 	if (err != 0) {
 		switch (errno) {
 		case EXDEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "not an earlier snapshot from the same fs"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 		case ENOENT:
 		case ESRCH:
 			/*
 			 * If the target itself exists, the missing object
 			 * must be the incremental source.
 			 */
 			if (lzc_exists(zhp->zfs_name)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "incremental source (%s) does not exist"),
 				    from);
 			}
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		case EBUSY:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "target is busy; if a filesystem, "
 			    "it must not be mounted"));
 			return (zfs_error(hdl, EZFS_BUSY, errbuf));
 
 		case EDQUOT:
 		case EFBIG:
 		case EIO:
 		case ENOLINK:
 		case ENOSPC:
 #ifdef illumos
 		case ENOSTR:
 #endif
 		case ENXIO:
 		case EPIPE:
 		case ERANGE:
 		case EFAULT:
 		case EROFS:
 			/* never use strerror() output as a format string */
 			zfs_error_aux(hdl, "%s", strerror(errno));
 			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, errno, errbuf));
 		}
 	}
 	/* every failure returned above */
 	return (0);
 }
 
 /*
  * Routines specific to "zfs recv"
  */
 
 /*
  * Read exactly 'ilen' bytes from 'fd' into 'buf', looping over short
  * reads.  When 'zc' is non-NULL the bytes read are folded into that
  * running fletcher-4 checksum, using the byteswapping variant when
  * 'byteswap' is set.
  *
  * Returns 0 on success.  If a read fails or the stream ends early, an
  * EZFS_BADSTREAM error is recorded on 'hdl' and its value returned.
  */
 static int
 recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
     boolean_t byteswap, zio_cksum_t *zc)
 {
 	char *pos = buf;
 	int remaining = ilen;
 	int nread;
 
 	assert(ilen <= SPA_MAXBLOCKSIZE);
 
 	/* Keep reading until read() reports end of data or an error. */
 	for (;;) {
 		nread = read(fd, pos, remaining);
 		pos += nread;
 		remaining -= nread;
 		if (nread <= 0)
 			break;
 	}
 
 	if (nread < 0 || remaining != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "failed to read from stream"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN,
 		    "cannot receive")));
 	}
 
 	if (zc == NULL)
 		return (0);
 
 	if (byteswap)
 		fletcher_4_incremental_byteswap(buf, ilen, zc);
 	else
 		fletcher_4_incremental_native(buf, ilen, zc);
 	return (0);
 }
 
 /*
  * Read a packed nvlist of 'len' bytes from 'fd' with recv_read() and
  * unpack it into '*nvp'.
  *
  * Returns 0 on success, ENOMEM if the buffer cannot be allocated, the
  * error from recv_read() if the read fails, or EINVAL (with a message
  * recorded on 'hdl') if the data does not unpack as an nvlist.
  */
 static int
 recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp,
     boolean_t byteswap, zio_cksum_t *zc)
 {
 	char *packed = zfs_alloc(hdl, len);
 	int rc;
 
 	if (packed == NULL)
 		return (ENOMEM);
 
 	rc = recv_read(hdl, fd, packed, len, byteswap, zc);
 	if (rc == 0 && nvlist_unpack(packed, len, nvp, 0) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (malformed nvlist)"));
 		rc = EINVAL;
 	}
 
 	free(packed);
 	return (rc);
 }
 
 /*
  * Rename the dataset 'name' out of the way during a receive.
  *
  * If 'tryname' is non-NULL, a rename to it is attempted first (and it is
  * copied into 'newname').  If that fails, or no 'tryname' was given, and
  * the component of 'name' at offset 'baselen' does not already start
  * with "recv-", a rename to a temporary "<base>recv-<pid>-<seq>" name
  * (written to 'newname') is attempted and EAGAIN is returned regardless
  * of its outcome.
  *
  * Returns 0 if the rename to 'tryname' succeeded, -1 if the dataset or
  * its changelist could not be obtained, and an error value otherwise.
  */
 static int
 recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
     int baselen, char *newname, recvflags_t *flags)
 {
 	static int seq;
 	zfs_cmd_t zc = { 0 };
 	int err;
 	prop_changelist_t *clp;
 	zfs_handle_t *zhp;
 
 	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
 	if (zhp == NULL)
 		return (-1);
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
 	    flags->force ? MS_FORCE : 0);
 	zfs_close(zhp);
 	if (clp == NULL)
 		return (-1);
 	err = changelist_prefix(clp);
 	if (err) {
 		/* don't leak the changelist on the early exit */
 		changelist_free(clp);
 		return (err);
 	}
 
 	zc.zc_objset_type = DMU_OST_ZFS;
 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 
 	if (tryname) {
 		(void) strcpy(newname, tryname);
 
 		(void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
 
 		if (flags->verbose) {
 			(void) printf("attempting rename %s to %s\n",
 			    zc.zc_name, zc.zc_value);
 		}
 		err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc);
 		if (err == 0)
 			changelist_rename(clp, name, tryname);
 	} else {
 		err = ENOENT;
 	}
 
 	if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) {
 		seq++;
 
 		(void) snprintf(newname, ZFS_MAXNAMELEN, "%.*srecv-%u-%u",
 		    baselen, name, getpid(), seq);
 		(void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value));
 
 		if (flags->verbose) {
 			(void) printf("failed - trying rename %s to %s\n",
 			    zc.zc_name, zc.zc_value);
 		}
 		err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc);
 		if (err == 0)
 			changelist_rename(clp, name, newname);
 		if (err && flags->verbose) {
 			(void) printf("failed (%u) - "
 			    "will try again on next pass\n", errno);
 		}
 		/* the caller is asked to make another pass either way */
 		err = EAGAIN;
 	} else if (flags->verbose) {
 		if (err == 0)
 			(void) printf("success\n");
 		else
 			(void) printf("failed (%u)\n", errno);
 	}
 
 	(void) changelist_postfix(clp);
 	changelist_free(clp);
 
 	return (err);
 }
 
 /*
  * Destroy the dataset 'name' during a receive.
  *
  * A snapshot on a pool whose SPA version is at least
  * SPA_VERSION_USERREFS is destroyed with the deferred-destroy flag set.
  * If the destroy fails, or a deferred destroy left the snapshot in
  * place, the dataset is instead renamed out of the way with
  * recv_rename() (which writes the temporary name to 'newname').
  *
  * Returns 0 on success, -1 if the dataset or its changelist could not be
  * obtained, and otherwise an error value (possibly from recv_rename()).
  */
 static int
 recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
     char *newname, recvflags_t *flags)
 {
 	zfs_cmd_t zc = { 0 };
 	int err = 0;
 	prop_changelist_t *clp;
 	zfs_handle_t *zhp;
 	boolean_t defer = B_FALSE;
 	int spa_version;
 
 	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
 	if (zhp == NULL)
 		return (-1);
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
 	    flags->force ? MS_FORCE : 0);
 	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
 	    zfs_spa_version(zhp, &spa_version) == 0 &&
 	    spa_version >= SPA_VERSION_USERREFS)
 		defer = B_TRUE;
 	zfs_close(zhp);
 	if (clp == NULL)
 		return (-1);
 	err = changelist_prefix(clp);
 	if (err) {
 		/* don't leak the changelist on the early exit */
 		changelist_free(clp);
 		return (err);
 	}
 
 	zc.zc_objset_type = DMU_OST_ZFS;
 	zc.zc_defer_destroy = defer;
 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 
 	if (flags->verbose)
 		(void) printf("attempting destroy %s\n", zc.zc_name);
 	err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
 	if (err == 0) {
 		if (flags->verbose)
 			(void) printf("success\n");
 		changelist_remove(clp, zc.zc_name);
 	}
 
 	(void) changelist_postfix(clp);
 	changelist_free(clp);
 
 	/*
 	 * Deferred destroy might destroy the snapshot or only mark it to be
 	 * destroyed later, and it returns success in either case.
 	 */
 	if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
 	    ZFS_TYPE_SNAPSHOT))) {
 		err = recv_rename(hdl, name, NULL, baselen, newname, flags);
 	}
 
 	return (err);
 }
 
 /*
  * Search state shared between guid_to_name() and guid_to_name_cb().
  */
 typedef struct guid_to_name_data {
 	uint64_t guid;		/* guid being searched for */
 	boolean_t bookmark_ok;	/* also iterate over bookmarks */
 	char *name;		/* out: name of the matching dataset */
 	char *skip;		/* last name component to skip, or NULL */
 } guid_to_name_data_t;
 
 /*
  * Iteration callback for guid_to_name(); 'arg' is a guid_to_name_data_t.
  *
  * A dataset whose last path component equals gtnd->skip is ignored.  If
  * the guid of 'zhp' matches, its name is copied to gtnd->name and EEXIST
  * is returned to stop the walk.  Otherwise the children (and, when
  * gtnd->bookmark_ok is set, the bookmarks) of 'zhp' are searched the
  * same way.  The handle is always closed before returning.
  */
 static int
 guid_to_name_cb(zfs_handle_t *zhp, void *arg)
 {
 	guid_to_name_data_t *gtnd = arg;
 	int rc;
 
 	if (gtnd->skip != NULL) {
 		const char *last = strrchr(zhp->zfs_name, '/');
 
 		if (last != NULL && strcmp(last + 1, gtnd->skip) == 0) {
 			zfs_close(zhp);
 			return (0);
 		}
 	}
 
 	if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) != gtnd->guid) {
 		/* Not this one: descend. */
 		rc = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
 		if (rc != EEXIST && gtnd->bookmark_ok)
 			rc = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
 	} else {
 		(void) strcpy(gtnd->name, zhp->zfs_name);
 		rc = EEXIST;
 	}
 
 	zfs_close(zhp);
 	return (rc);
 }
 
 /*
  * Attempt to find the local dataset associated with this guid.  In the case of
  * multiple matches, we attempt to find the "best" match by searching
  * progressively larger portions of the hierarchy.  This allows one to send a
  * tree of datasets individually and guarantee that we will find the source
  * guid within that hierarchy, even if there are multiple matches elsewhere.
  *
  * 'parent' is the dataset (or snapshot) name at which the search starts.
  * Bookmarks are searched as well when 'bookmark_ok' is set.  On a match
  * the dataset name is copied into 'name' and 0 is returned; otherwise
  * ENOENT is returned.
  */
 static int
 guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
     boolean_t bookmark_ok, char *name)
 {
 	char pname[ZFS_MAXNAMELEN];
 	guid_to_name_data_t gtnd;
 
 	gtnd.guid = guid;
 	gtnd.bookmark_ok = bookmark_ok;
 	gtnd.name = name;
 	gtnd.skip = NULL;
 
 	/*
 	 * Search progressively larger portions of the hierarchy, starting
 	 * with the filesystem specified by 'parent'.  This will
 	 * select the "most local" version of the origin snapshot in the case
 	 * that there are multiple matching snapshots in the system.
 	 */
 	(void) strlcpy(pname, parent, sizeof (pname));
 	char *cp = strrchr(pname, '@');
 	if (cp == NULL)
 		cp = strchr(pname, '\0');
 	for (; cp != NULL; cp = strrchr(pname, '/')) {
 		/* Chop off the last component and open the parent */
 		*cp = '\0';
 		zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
 
 		if (zhp == NULL)
 			continue;
 		/* the callback closes the handle it is given; pass a copy */
 		int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
 		if (err != EEXIST)
 			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
 		if (err != EEXIST && bookmark_ok)
 			err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
 		zfs_close(zhp);
 		if (err == EEXIST)
 			return (0);
 
 		/*
 		 * Remember the last portion of the dataset so we skip it next
 		 * time through (as we've already searched that portion of the
 		 * hierarchy).  Once there is no '/' left the loop ends, so
 		 * there is nothing to skip; don't do arithmetic on NULL.
 		 */
 		char *slash = strrchr(pname, '/');
 		gtnd.skip = (slash != NULL) ? slash + 1 : NULL;
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Compare the creation txgs of the snapshots with guids 'guid1' and
  * 'guid2', both looked up in 'avl'.
  *
  * Return values, as implemented:
  *	 0	guid2 is 0, or both snapshots have the same creation txg
  *	+1	guid1 is 0 (and guid2 is not), or guid1's creation txg is
  *		greater than guid2's
  *	-1	guid1's creation txg is less than guid2's, or one of the
  *		snapshots could not be opened
  *
  * NOTE(review): the earlier header comment described +1 as "guid1 is
  * before guid2" and -1 as "after", which does not match the txg
  * comparison below; confirm the intended meaning against the caller.
  */
 static int
 created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
     uint64_t guid1, uint64_t guid2)
 {
 	char snapbuf[ZFS_MAXNAMELEN];
 	char *fsname, *snapname;
 	nvlist_t *nvfs;
 	zfs_handle_t *first, *second;
 	uint64_t txg1, txg2;
 
 	if (guid2 == 0)
 		return (0);
 	if (guid1 == 0)
 		return (1);
 
 	/* Open the snapshot for guid1. */
 	nvfs = fsavl_find(avl, guid1, &snapname);
 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
 	(void) snprintf(snapbuf, sizeof (snapbuf), "%s@%s", fsname, snapname);
 	first = zfs_open(hdl, snapbuf, ZFS_TYPE_SNAPSHOT);
 	if (first == NULL)
 		return (-1);
 
 	/* Open the snapshot for guid2. */
 	nvfs = fsavl_find(avl, guid2, &snapname);
 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
 	(void) snprintf(snapbuf, sizeof (snapbuf), "%s@%s", fsname, snapname);
 	second = zfs_open(hdl, snapbuf, ZFS_TYPE_SNAPSHOT);
 	if (second == NULL) {
 		zfs_close(first);
 		return (-1);
 	}
 
 	txg1 = zfs_prop_get_int(first, ZFS_PROP_CREATETXG);
 	txg2 = zfs_prop_get_int(second, ZFS_PROP_CREATETXG);
 
 	zfs_close(first);
 	zfs_close(second);
 
 	if (txg1 == txg2)
 		return (0);
 	return (txg1 < txg2 ? -1 : +1);
 }
 
 static int
 recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
     recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl,
     nvlist_t *renamed)
 {
 	nvlist_t *local_nv, *deleted = NULL;
 	avl_tree_t *local_avl;
 	nvpair_t *fselem, *nextfselem;
 	char *fromsnap;
 	char newname[ZFS_MAXNAMELEN];
 	char guidname[32];
 	int error;
 	boolean_t needagain, progress, recursive;
 	char *s1, *s2;
 
 	VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
 
 	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
 	    ENOENT);
 
 	if (flags->dryrun)
 		return (0);
 
 again:
 	needagain = progress = B_FALSE;
 
 	VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0));
 
 	if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
 	    recursive, &local_nv, &local_avl)) != 0)
 		return (error);
 
 	/*
 	 * Process deletes and renames
 	 */
 	for (fselem = nvlist_next_nvpair(local_nv, NULL);
 	    fselem; fselem = nextfselem) {
 		nvlist_t *nvfs, *snaps;
 		nvlist_t *stream_nvfs = NULL;
 		nvpair_t *snapelem, *nextsnapelem;
 		uint64_t fromguid = 0;
 		uint64_t originguid = 0;
 		uint64_t stream_originguid = 0;
 		uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid;
 		char *fsname, *stream_fsname;
 
 		nextfselem = nvlist_next_nvpair(local_nv, fselem);
 
 		VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs));
 		VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
 		VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
 		VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap",
 		    &parent_fromsnap_guid));
 		(void) nvlist_lookup_uint64(nvfs, "origin", &originguid);
 
 		/*
 		 * First find the stream's fs, so we can check for
 		 * a different origin (due to "zfs promote")
 		 */
 		for (snapelem = nvlist_next_nvpair(snaps, NULL);
 		    snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) {
 			uint64_t thisguid;
 
 			VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
 			stream_nvfs = fsavl_find(stream_avl, thisguid, NULL);
 
 			if (stream_nvfs != NULL)
 				break;
 		}
 
 		/* check for promote */
 		(void) nvlist_lookup_uint64(stream_nvfs, "origin",
 		    &stream_originguid);
 		if (stream_nvfs && originguid != stream_originguid) {
 			switch (created_before(hdl, local_avl,
 			    stream_originguid, originguid)) {
 			case 1: {
 				/* promote it! */
 				zfs_cmd_t zc = { 0 };
 				nvlist_t *origin_nvfs;
 				char *origin_fsname;
 
 				if (flags->verbose)
 					(void) printf("promoting %s\n", fsname);
 
 				origin_nvfs = fsavl_find(local_avl, originguid,
 				    NULL);
 				VERIFY(0 == nvlist_lookup_string(origin_nvfs,
 				    "name", &origin_fsname));
 				(void) strlcpy(zc.zc_value, origin_fsname,
 				    sizeof (zc.zc_value));
 				(void) strlcpy(zc.zc_name, fsname,
 				    sizeof (zc.zc_name));
 				error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc);
 				if (error == 0)
 					progress = B_TRUE;
 				break;
 			}
 			default:
 				break;
 			case -1:
 				fsavl_destroy(local_avl);
 				nvlist_free(local_nv);
 				return (-1);
 			}
 			/*
 			 * We had/have the wrong origin, therefore our
 			 * list of snapshots is wrong.  Need to handle
 			 * them on the next pass.
 			 */
 			needagain = B_TRUE;
 			continue;
 		}
 
 		for (snapelem = nvlist_next_nvpair(snaps, NULL);
 		    snapelem; snapelem = nextsnapelem) {
 			uint64_t thisguid;
 			char *stream_snapname;
 			nvlist_t *found, *props;
 
 			nextsnapelem = nvlist_next_nvpair(snaps, snapelem);
 
 			VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
 			found = fsavl_find(stream_avl, thisguid,
 			    &stream_snapname);
 
 			/* check for delete */
 			if (found == NULL) {
 				char name[ZFS_MAXNAMELEN];
 
 				if (!flags->force)
 					continue;
 
 				(void) snprintf(name, sizeof (name), "%s@%s",
 				    fsname, nvpair_name(snapelem));
 
 				error = recv_destroy(hdl, name,
 				    strlen(fsname)+1, newname, flags);
 				if (error)
 					needagain = B_TRUE;
 				else
 					progress = B_TRUE;
 				sprintf(guidname, "%lu", thisguid);
 				nvlist_add_boolean(deleted, guidname);
 				continue;
 			}
 
 			stream_nvfs = found;
 
 			if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops",
 			    &props) && 0 == nvlist_lookup_nvlist(props,
 			    stream_snapname, &props)) {
 				zfs_cmd_t zc = { 0 };
 
 				zc.zc_cookie = B_TRUE; /* received */
 				(void) snprintf(zc.zc_name, sizeof (zc.zc_name),
 				    "%s@%s", fsname, nvpair_name(snapelem));
 				if (zcmd_write_src_nvlist(hdl, &zc,
 				    props) == 0) {
 					(void) zfs_ioctl(hdl,
 					    ZFS_IOC_SET_PROP, &zc);
 					zcmd_free_nvlists(&zc);
 				}
 			}
 
 			/* check for different snapname */
 			if (strcmp(nvpair_name(snapelem),
 			    stream_snapname) != 0) {
 				char name[ZFS_MAXNAMELEN];
 				char tryname[ZFS_MAXNAMELEN];
 
 				(void) snprintf(name, sizeof (name), "%s@%s",
 				    fsname, nvpair_name(snapelem));
 				(void) snprintf(tryname, sizeof (name), "%s@%s",
 				    fsname, stream_snapname);
 
 				error = recv_rename(hdl, name, tryname,
 				    strlen(fsname)+1, newname, flags);
 				if (error)
 					needagain = B_TRUE;
 				else
 					progress = B_TRUE;
 			}
 
 			if (strcmp(stream_snapname, fromsnap) == 0)
 				fromguid = thisguid;
 		}
 
 		/* check for delete */
 		if (stream_nvfs == NULL) {
 			if (!flags->force)
 				continue;
 
 			error = recv_destroy(hdl, fsname, strlen(tofs)+1,
 			    newname, flags);
 			if (error)
 				needagain = B_TRUE;
 			else
 				progress = B_TRUE;
 			sprintf(guidname, "%lu", parent_fromsnap_guid);
 			nvlist_add_boolean(deleted, guidname);
 			continue;
 		}
 
 		if (fromguid == 0) {
 			if (flags->verbose) {
 				(void) printf("local fs %s does not have "
 				    "fromsnap (%s in stream); must have "
 				    "been deleted locally; ignoring\n",
 				    fsname, fromsnap);
 			}
 			continue;
 		}
 
 		VERIFY(0 == nvlist_lookup_string(stream_nvfs,
 		    "name", &stream_fsname));
 		VERIFY(0 == nvlist_lookup_uint64(stream_nvfs,
 		    "parentfromsnap", &stream_parent_fromsnap_guid));
 
 		s1 = strrchr(fsname, '/');
 		s2 = strrchr(stream_fsname, '/');
 
 		/*
 		 * Check if we're going to rename based on parent guid change
 		 * and the current parent guid was also deleted. If it was then
 		 * rename will fail and is likely unneeded, so avoid this and
 		 * force an early retry to determine the new
 		 * parent_fromsnap_guid.
 		 */
 		if (stream_parent_fromsnap_guid != 0 &&
                     parent_fromsnap_guid != 0 &&
                     stream_parent_fromsnap_guid != parent_fromsnap_guid) {
 			sprintf(guidname, "%lu", parent_fromsnap_guid);
 			if (nvlist_exists(deleted, guidname)) {
 				progress = B_TRUE;
 				needagain = B_TRUE;
 				goto doagain;
 			}
 		}
 
 		/*
 		 * Check for rename. If the exact receive path is specified, it
 		 * does not count as a rename, but we still need to check the
 		 * datasets beneath it.
 		 */
 		if ((stream_parent_fromsnap_guid != 0 &&
 		    parent_fromsnap_guid != 0 &&
 		    stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
 		    ((flags->isprefix || strcmp(tofs, fsname) != 0) &&
 		    (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
 			nvlist_t *parent;
 			char tryname[ZFS_MAXNAMELEN];
 
 			parent = fsavl_find(local_avl,
 			    stream_parent_fromsnap_guid, NULL);
 			/*
 			 * NB: parent might not be found if we used the
 			 * tosnap for stream_parent_fromsnap_guid,
 			 * because the parent is a newly-created fs;
 			 * we'll be able to rename it after we recv the
 			 * new fs.
 			 */
 			if (parent != NULL) {
 				char *pname;
 
 				VERIFY(0 == nvlist_lookup_string(parent, "name",
 				    &pname));
 				(void) snprintf(tryname, sizeof (tryname),
 				    "%s%s", pname, strrchr(stream_fsname, '/'));
 			} else {
 				tryname[0] = '\0';
 				if (flags->verbose) {
 					(void) printf("local fs %s new parent "
 					    "not found\n", fsname);
 				}
 			}
 
 			newname[0] = '\0';
 
 			error = recv_rename(hdl, fsname, tryname,
 			    strlen(tofs)+1, newname, flags);
 
 			if (renamed != NULL && newname[0] != '\0') {
 				VERIFY(0 == nvlist_add_boolean(renamed,
 				    newname));
 			}
 
 			if (error)
 				needagain = B_TRUE;
 			else
 				progress = B_TRUE;
 		}
 	}
 
 doagain:
 	fsavl_destroy(local_avl);
 	nvlist_free(local_nv);
 	nvlist_free(deleted);
 
 	if (needagain && progress) {
 		/* do another pass to fix up temporary names */
 		if (flags->verbose)
 			(void) printf("another pass:\n");
 		goto again;
 	}
 
 	return (needagain);
 }
 
 /*
  * Receive a compound ("package") stream whose BEGIN record is *drr.
  *
  * The optional nvlist payload following the BEGIN record is read into
  * stream_nv, the END record that closes the package header is read and its
  * checksum compared against *zc, and then each contained stream is received
  * with zfs_receive_impl() until one fails or reports ENODATA.  When the
  * payload names a "fromsnap", recv_incremental_replication() is run once
  * before the contained streams are received and once more afterwards.
  *
  * Returns -1 if any contained stream failed, otherwise -2 if a "soft"
  * error was recorded (replication fixups or unmounting), otherwise the
  * error of the step that failed, or 0.
  */
 static int
 zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
     recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
     char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
 {
 	nvlist_t *stream_nv = NULL;
 	avl_tree_t *stream_avl = NULL;
 	char *fromsnap = NULL;
 	char *sendsnap = NULL;
 	char *cp;
 	char tofs[ZFS_MAXNAMELEN];
 	char sendfs[ZFS_MAXNAMELEN];
 	char errbuf[1024];
 	dmu_replay_record_t drre;
 	int error;
 	boolean_t anyerr = B_FALSE;
 	boolean_t softerr = B_FALSE;
 	boolean_t recursive;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
 	assert(drr->drr_type == DRR_BEGIN);
 	assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
 	assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM);
 
 	/*
 	 * Read in the nvlist from the stream.
 	 */
 	if (drr->drr_payloadlen != 0) {
 		error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
 		    &stream_nv, flags->byteswap, zc);
 		if (error) {
 			error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			goto out;
 		}
 	}
 
 	/* The stream is recursive unless it carries "not_recursive". */
 	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
 	    ENOENT);
 
 	if (recursive && strchr(destname, '@')) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "cannot specify snapshot name for multi-snapshot stream"));
 		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 		goto out;
 	}
 
 	/*
 	 * Read in the end record and verify checksum.
 	 */
 	if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre),
 	    flags->byteswap, NULL)))
 		goto out;
 	if (flags->byteswap) {
 		drre.drr_type = BSWAP_32(drre.drr_type);
 		drre.drr_u.drr_end.drr_checksum.zc_word[0] =
 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]);
 		drre.drr_u.drr_end.drr_checksum.zc_word[1] =
 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]);
 		drre.drr_u.drr_end.drr_checksum.zc_word[2] =
 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]);
 		drre.drr_u.drr_end.drr_checksum.zc_word[3] =
 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]);
 	}
 	if (drre.drr_type != DRR_END) {
 		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 		goto out;
 	}
 	if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "incorrect header checksum"));
 		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 		goto out;
 	}
 
 	(void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap);
 
 	if (drr->drr_payloadlen != 0) {
 		nvlist_t *stream_fss;
 
 		VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss",
 		    &stream_fss));
 		if ((stream_avl = fsavl_create(stream_fss)) == NULL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "couldn't allocate avl tree"));
 			error = zfs_error(hdl, EZFS_NOMEM, errbuf);
 			goto out;
 		}
 
 		if (fromsnap != NULL) {
 			nvlist_t *renamed = NULL;
 			nvpair_t *pair = NULL;
 
 			(void) strlcpy(tofs, destname, ZFS_MAXNAMELEN);
 			if (flags->isprefix) {
 				struct drr_begin *drrb = &drr->drr_u.drr_begin;
 				char *at;
 				int i;
 
 				if (flags->istail) {
 					cp = strrchr(drrb->drr_toname, '/');
 					if (cp == NULL) {
 						(void) strlcat(tofs, "/",
 						    ZFS_MAXNAMELEN);
 						i = 0;
 					} else {
 						i = (cp - drrb->drr_toname);
 					}
 				} else {
 					i = strcspn(drrb->drr_toname, "/@");
 				}
 				/* zfs_receive_one() will create_parents() */
 				(void) strlcat(tofs, &drrb->drr_toname[i],
 				    ZFS_MAXNAMELEN);
 				/*
 				 * drr_toname comes from the stream; if it
 				 * carries no '@' (or the '@' was lost to
 				 * truncation above) fail cleanly rather than
 				 * writing through a NULL pointer.
 				 */
 				at = strchr(tofs, '@');
 				if (at == NULL) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "invalid stream (bad snapshot "
 					    "name)"));
 					error = zfs_error(hdl, EZFS_BADSTREAM,
 					    errbuf);
 					goto out;
 				}
 				*at = '\0';
 			}
 
 			if (recursive && !flags->dryrun && !flags->nomount) {
 				VERIFY(0 == nvlist_alloc(&renamed,
 				    NV_UNIQUE_NAME, 0));
 			}
 
 			softerr = recv_incremental_replication(hdl, tofs, flags,
 			    stream_nv, stream_avl, renamed);
 
 			/* Unmount renamed filesystems before receiving. */
 			while ((pair = nvlist_next_nvpair(renamed,
 			    pair)) != NULL) {
 				zfs_handle_t *zhp;
 				prop_changelist_t *clp = NULL;
 
 				zhp = zfs_open(hdl, nvpair_name(pair),
 				    ZFS_TYPE_FILESYSTEM);
 				if (zhp != NULL) {
 					clp = changelist_gather(zhp,
 					    ZFS_PROP_MOUNTPOINT, 0, 0);
 					zfs_close(zhp);
 					if (clp != NULL) {
 						softerr |=
 						    changelist_prefix(clp);
 						changelist_free(clp);
 					}
 				}
 			}
 
 			nvlist_free(renamed);
 		}
 	}
 
 	/*
 	 * Get the fs specified by the first path in the stream (the top level
 	 * specified by 'zfs send') and pass it to each invocation of
 	 * zfs_receive_one().
 	 */
 	(void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname,
 	    ZFS_MAXNAMELEN);
 	if ((cp = strchr(sendfs, '@')) != NULL) {
 		*cp = '\0';
 		/*
 		 * Find the "sendsnap", the final snapshot in a replication
 		 * stream.  zfs_receive_one() handles certain errors
 		 * differently, depending on if the contained stream is the
 		 * last one or not.
 		 */
 		sendsnap = (cp + 1);
 	}
 
 	/* Finally, receive each contained stream */
 	do {
 		/*
 		 * we should figure out if it has a recoverable
 		 * error, in which case do a recv_skip() and drive on.
 		 * Note, if we fail due to already having this guid,
 		 * zfs_receive_one() will take care of it (ie,
 		 * recv_skip() and return 0).
 		 */
 		error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
 		    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
 		    action_handlep, sendsnap);
 		/* ENODATA ends the loop without counting as a failure. */
 		if (error == ENODATA) {
 			error = 0;
 			break;
 		}
 		anyerr |= error;
 	} while (error == 0);
 
 	/* Same condition under which tofs was initialized above. */
 	if (drr->drr_payloadlen != 0 && fromsnap != NULL) {
 		/*
 		 * Now that we have the fs's they sent us, try the
 		 * renames again.
 		 */
 		softerr = recv_incremental_replication(hdl, tofs, flags,
 		    stream_nv, stream_avl, NULL);
 	}
 
 out:
 	fsavl_destroy(stream_avl);
 	if (stream_nv)
 		nvlist_free(stream_nv);
 	/* A hard error from a contained stream outranks a soft error. */
 	if (softerr)
 		error = -2;
 	if (anyerr)
 		error = -1;
 	return (error);
 }
 
 /*
  * Print to stderr how many further property errors were left out of the
  * report.  'truncated' is that count and must be non-zero; the singular
  * wording is used when it is exactly 1.
  */
 static void
 trunc_prop_errs(int truncated)
 {
 	ASSERT(truncated != 0);
 
 	if (truncated != 1) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 		    "%d more properties could not be set\n"), truncated);
 		return;
 	}
 
 	(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 	    "1 more property could not be set\n"));
 }
 
 /*
  * Read and discard records from fd until a DRR_END record is seen.
  *
  * Each record header is read into a scratch buffer; for record types that
  * carry a payload, the payload length is taken from the header (byteswapped
  * first when 'byteswap' is set) and that many bytes are read over the same
  * buffer and thrown away.
  *
  * Returns 0 once DRR_END has been consumed, the result of zfs_error() for
  * an unrecognized record type, and -1 if the scratch buffer could not be
  * allocated or a record header could not be read before DRR_END.
  */
 static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
 	dmu_replay_record_t *drr;
 	void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
 	char errbuf[1024];
 
 	/* NOTE(review): assumes zfs_alloc() may return NULL on failure. */
 	if (buf == NULL)
 		return (-1);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive:"));
 
 	/* XXX would be great to use lseek if possible... */
 	drr = buf;
 
 	while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t),
 	    byteswap, NULL) == 0) {
 		if (byteswap)
 			drr->drr_type = BSWAP_32(drr->drr_type);
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
 			if (drr->drr_payloadlen != 0) {
 				(void) recv_read(hdl, fd, buf,
 				    drr->drr_payloadlen, B_FALSE, NULL);
 			}
 			break;
 
 		case DRR_END:
 			free(buf);
 			return (0);
 
 		case DRR_OBJECT:
 			if (byteswap) {
 				drr->drr_u.drr_object.drr_bonuslen =
 				    BSWAP_32(drr->drr_u.drr_object.
 				    drr_bonuslen);
 			}
 			(void) recv_read(hdl, fd, buf,
 			    P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8),
 			    B_FALSE, NULL);
 			break;
 
 		case DRR_WRITE:
 			if (byteswap) {
 				drr->drr_u.drr_write.drr_length =
 				    BSWAP_64(drr->drr_u.drr_write.drr_length);
 			}
 			(void) recv_read(hdl, fd, buf,
 			    drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
 			break;
 		case DRR_SPILL:
 			/*
 			 * Swap the spill record's own length field; it is
 			 * the one used for the read just below.
 			 */
 			if (byteswap) {
 				drr->drr_u.drr_spill.drr_length =
 				    BSWAP_64(drr->drr_u.drr_spill.drr_length);
 			}
 			(void) recv_read(hdl, fd, buf,
 			    drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
 			break;
 		case DRR_WRITE_EMBEDDED:
 			if (byteswap) {
 				drr->drr_u.drr_write_embedded.drr_psize =
 				    BSWAP_32(drr->drr_u.drr_write_embedded.
 				    drr_psize);
 			}
 			(void) recv_read(hdl, fd, buf,
 			    P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
 			    8), B_FALSE, NULL);
 			break;
 		case DRR_WRITE_BYREF:
 		case DRR_FREEOBJECTS:
 		case DRR_FREE:
 			/* No payload is read for these record types. */
 			break;
 
 		default:
 			/* Release the scratch buffer on this exit path too. */
 			free(buf);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid record type"));
 			return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 		}
 	}
 
 	free(buf);
 	return (-1);
 }
 
 /*
  * Set the auxiliary error text for a checksum failure while receiving
  * target_snap.  For a resumable receive, the containing dataset is opened
  * and, if its receive_resume_token property can be read, the message is
  * replaced by one that also shows the "zfs send -t <token>" command.
  */
 static void
 recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap,
     boolean_t resumable)
 {
 	char fsname[ZFS_MAXNAMELEN];
 	char resume_token[ZFS_MAXPROPLEN];
 	zfs_handle_t *fs_hdl;
 	char *at;
 
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "checksum mismatch or incomplete stream"));
 
 	if (resumable) {
 		/* Cut the snapshot name at '@' to get the dataset name. */
 		(void) strlcpy(fsname, target_snap, sizeof (fsname));
 		at = strchr(fsname, '@');
 		*at = '\0';
 
 		fs_hdl = zfs_open(hdl, fsname,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (fs_hdl != NULL) {
 			if (zfs_prop_get(fs_hdl,
 			    ZFS_PROP_RECEIVE_RESUME_TOKEN,
 			    resume_token, sizeof (resume_token),
 			    NULL, NULL, 0, B_TRUE) == 0) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "checksum mismatch or incomplete stream.\n"
 				    "Partially received snapshot is saved.\n"
 				    "A resuming stream can be generated on the sending "
 				    "system by running:\n"
 				    "    zfs send -t %s"),
 				    resume_token);
 			}
 			zfs_close(fs_hdl);
 		}
 	}
 }
 
 /*
  * Restores a backup of tosnap from the file descriptor specified by infd.
  *
  * The destination snapshot name is built in zc.zc_value from tosnap plus
  * the part of the stream's drr_toname selected by the receive flags
  * (istail, isprefix, or an exact path).  The clone origin, if any, is
  * placed in zc.zc_string, the destination is checked against what the
  * stream expects (new filesystem vs. incremental), and the stream is then
  * handed to the kernel with ZFS_IOC_RECV.  Property errors reported by the
  * ioctl are printed, received snapshot properties are applied, and the top
  * filesystem is recorded in *top_zfs for the caller.
  *
  * With flags->dryrun set, nothing is received; the stream is consumed with
  * recv_skip() and its result returned.
  *
  * Returns 0 on success; otherwise -1 or the value returned by zfs_error().
  */
 static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
     const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
     dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
     avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
     uint64_t *action_handlep, const char *finalsnap)
 {
 	zfs_cmd_t zc = { 0 };
 	time_t begin_time;
 	int ioctl_err, ioctl_errno, err;
 	char *cp;
 	struct drr_begin *drrb = &drr->drr_u.drr_begin;
 	char errbuf[1024];
 	char prop_errbuf[1024];
 	const char *chopprefix;
 	boolean_t newfs = B_FALSE;
 	boolean_t stream_wantsnewfs;
 	uint64_t parent_snapguid = 0;
 	prop_changelist_t *clp = NULL;
 	nvlist_t *snapprops_nvlist = NULL;
 	zprop_errflags_t prop_errflags;
 	boolean_t recursive;
 	char *snapname = NULL;
 
 	begin_time = time(NULL);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
 	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
 	    ENOENT);
 
 	if (stream_avl != NULL) {
 		nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
 		    &snapname);
 		nvlist_t *props;
 		int ret;
 
 		(void) nvlist_lookup_uint64(fs, "parentfromsnap",
 		    &parent_snapguid);
 		/* If the stream has no "props", use a temporary empty list. */
 		err = nvlist_lookup_nvlist(fs, "props", &props);
 		if (err)
 			VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
 
 		if (flags->canmountoff) {
 			VERIFY(0 == nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0));
 		}
 		ret = zcmd_write_src_nvlist(hdl, &zc, props);
 		if (err)
 			nvlist_free(props);
 
 		if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) {
 			VERIFY(0 == nvlist_lookup_nvlist(props,
 			    snapname, &snapprops_nvlist));
 		}
 
 		if (ret != 0)
 			return (-1);
 	}
 
 	cp = NULL;
 
 	/*
 	 * Determine how much of the snapshot name stored in the stream
 	 * we are going to tack on to the name they specified on the
 	 * command line, and how much we are going to chop off.
 	 *
 	 * If they specified a snapshot, chop the entire name stored in
 	 * the stream.
 	 *
 	 * Every error return below releases whatever
 	 * zcmd_write_src_nvlist() attached to zc above.
 	 */
 	if (flags->istail) {
 		/*
 		 * A filesystem was specified with -e. We want to tack on only
 		 * the tail of the sent snapshot path.
 		 */
 		if (strchr(tosnap, '@')) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 			    "argument - snapshot not allowed with -e"));
 			zcmd_free_nvlists(&zc);
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 
 		chopprefix = strrchr(sendfs, '/');
 
 		if (chopprefix == NULL) {
 			/*
 			 * The tail is the poolname, so we need to
 			 * prepend a path separator.
 			 */
 			int len = strlen(drrb->drr_toname);
 			cp = malloc(len + 2);
 			if (cp == NULL) {
 				zcmd_free_nvlists(&zc);
 				return (zfs_error(hdl, EZFS_NOMEM, errbuf));
 			}
 			cp[0] = '/';
 			(void) strcpy(&cp[1], drrb->drr_toname);
 			chopprefix = cp;
 		} else {
 			chopprefix = drrb->drr_toname + (chopprefix - sendfs);
 		}
 	} else if (flags->isprefix) {
 		/*
 		 * A filesystem was specified with -d. We want to tack on
 		 * everything but the first element of the sent snapshot path
 		 * (all but the pool name).
 		 */
 		if (strchr(tosnap, '@')) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 			    "argument - snapshot not allowed with -d"));
 			zcmd_free_nvlists(&zc);
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 
 		chopprefix = strchr(drrb->drr_toname, '/');
 		if (chopprefix == NULL)
 			chopprefix = strchr(drrb->drr_toname, '@');
 	} else if (strchr(tosnap, '@') == NULL) {
 		/*
 		 * If a filesystem was specified without -d or -e, we want to
 		 * tack on everything after the fs specified by 'zfs send'.
 		 */
 		chopprefix = drrb->drr_toname + strlen(sendfs);
 	} else {
 		/* A snapshot was specified as an exact path (no -d or -e). */
 		if (recursive) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "cannot specify snapshot name for multi-snapshot "
 			    "stream"));
 			zcmd_free_nvlists(&zc);
 			return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 		}
 		chopprefix = drrb->drr_toname + strlen(drrb->drr_toname);
 	}
 
 	ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname);
 	ASSERT(chopprefix > drrb->drr_toname);
 	ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname));
 	ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' ||
 	    chopprefix[0] == '\0');
 
 	/*
 	 * Determine name of destination snapshot, store in zc_value.
 	 * Both steps are bounded by the size of zc_value; an over-long
 	 * result is truncated here and then rejected by zfs_name_valid().
 	 */
 	(void) strlcpy(zc.zc_value, tosnap, sizeof (zc.zc_value));
 	(void) strlcat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
 #ifdef __FreeBSD__
 	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
 		zfs_ioctl_version = get_zfs_ioctl_version();
 	/*
 	 * For forward compatibility hide tosnap in zc_value
 	 */
 	if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
 		/* Place it just past the terminating NUL, within bounds. */
 		size_t hidden_off = strlen(zc.zc_value) + 1;
 
 		(void) strlcpy(zc.zc_value + hidden_off, tosnap,
 		    sizeof (zc.zc_value) - hidden_off);
 	}
 #endif
 	free(cp);
 	if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
 		zcmd_free_nvlists(&zc);
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/*
 	 * Determine the name of the origin snapshot, store in zc_string.
 	 */
 	if (drrb->drr_flags & DRR_FLAG_CLONE) {
 		if (guid_to_name(hdl, zc.zc_value,
 		    drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) {
 			zcmd_free_nvlists(&zc);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "local origin for clone %s does not exist"),
 			    zc.zc_value);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
 		if (flags->verbose)
 			(void) printf("found clone origin %s\n", zc.zc_string);
 	} else if (originsnap) {
 		/* strlcpy() always leaves zc_string NUL-terminated. */
 		(void) strlcpy(zc.zc_string, originsnap,
 		    sizeof (zc.zc_string));
 		if (flags->verbose)
 			(void) printf("using provided clone origin %s\n",
 			    zc.zc_string);
 	}
 
 	boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
 	    DMU_BACKUP_FEATURE_RESUMING;
 	stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
 	    (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming;
 
 	if (stream_wantsnewfs) {
 		/*
 		 * if the parent fs does not exist, look for it based on
 		 * the parent snap GUID
 		 */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot receive new filesystem stream"));
 
 		(void) strcpy(zc.zc_name, zc.zc_value);
 		cp = strrchr(zc.zc_name, '/');
 		if (cp)
 			*cp = '\0';
 		if (cp &&
 		    !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
 			char suffix[ZFS_MAXNAMELEN];
 			(void) strcpy(suffix, strrchr(zc.zc_value, '/'));
 			if (guid_to_name(hdl, zc.zc_name, parent_snapguid,
 			    B_FALSE, zc.zc_value) == 0) {
 				*strchr(zc.zc_value, '@') = '\0';
 				(void) strcat(zc.zc_value, suffix);
 			}
 		}
 	} else {
 		/*
 		 * if the fs does not exist, look for it based on the
 		 * fromsnap GUID
 		 */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot receive incremental stream"));
 
 		(void) strcpy(zc.zc_name, zc.zc_value);
 		*strchr(zc.zc_name, '@') = '\0';
 
 		/*
 		 * If the exact receive path was specified and this is the
 		 * topmost path in the stream, then if the fs does not exist we
 		 * should look no further.
 		 */
 		if ((flags->isprefix || (*(chopprefix = drrb->drr_toname +
 		    strlen(sendfs)) != '\0' && *chopprefix != '@')) &&
 		    !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
 			char snap[ZFS_MAXNAMELEN];
 			(void) strcpy(snap, strchr(zc.zc_value, '@'));
 			if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid,
 			    B_FALSE, zc.zc_value) == 0) {
 				*strchr(zc.zc_value, '@') = '\0';
 				(void) strcat(zc.zc_value, snap);
 			}
 		}
 	}
 
 	/* zc_name is the destination dataset: zc_value cut at '@'. */
 	(void) strcpy(zc.zc_name, zc.zc_value);
 	*strchr(zc.zc_name, '@') = '\0';
 
 	if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
 		zfs_handle_t *zhp;
 
 		/*
 		 * Destination fs exists.  It must be one of these cases:
 		 *  - an incremental send stream
 		 *  - the stream specifies a new fs (full stream or clone)
 		 *    and they want us to blow away the existing fs (and
 		 *    have therefore specified -F and removed any snapshots)
 		 *  - we are resuming a failed receive.
 		 */
 		if (stream_wantsnewfs) {
 			if (!flags->force) {
 				zcmd_free_nvlists(&zc);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "destination '%s' exists\n"
 				    "must specify -F to overwrite it"),
 				    zc.zc_name);
 				return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 			}
 			if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
 			    &zc) == 0) {
 				zcmd_free_nvlists(&zc);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "destination has snapshots (eg. %s)\n"
 				    "must destroy them to overwrite it"),
 				    zc.zc_name);
 				return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 			}
 		}
 
 		if ((zhp = zfs_open(hdl, zc.zc_name,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 
 		if (stream_wantsnewfs &&
 		    zhp->zfs_dmustats.dds_origin[0]) {
 			zcmd_free_nvlists(&zc);
 			zfs_close(zhp);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination '%s' is a clone\n"
 			    "must destroy it to overwrite it"),
 			    zc.zc_name);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
 		if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
 		    stream_wantsnewfs) {
 			/* We can't do online recv in this case */
 			clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
 			if (clp == NULL) {
 				zfs_close(zhp);
 				zcmd_free_nvlists(&zc);
 				return (-1);
 			}
 			if (changelist_prefix(clp) != 0) {
 				changelist_free(clp);
 				zfs_close(zhp);
 				zcmd_free_nvlists(&zc);
 				return (-1);
 			}
 		}
 
 		/*
 		 * If we are resuming a newfs, set newfs here so that we will
 		 * mount it if the recv succeeds this time.  We can tell
 		 * that it was a newfs on the first recv because the fs
 		 * itself will be inconsistent (if the fs existed when we
 		 * did the first recv, we would have received it into
 		 * .../%recv).
 		 */
 		if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
 			newfs = B_TRUE;
 
 		zfs_close(zhp);
 	} else {
 		/*
 		 * Destination filesystem does not exist.  Therefore we better
 		 * be creating a new filesystem (either from a full backup, or
 		 * a clone).  It would therefore be invalid if the user
 		 * specified only the pool name (i.e. if the destination name
 		 * contained no slash character).
 		 */
 		if (!stream_wantsnewfs ||
 		    (cp = strrchr(zc.zc_name, '/')) == NULL) {
 			zcmd_free_nvlists(&zc);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination '%s' does not exist"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
 
 		/*
 		 * Trim off the final dataset component so we perform the
 		 * recvbackup ioctl to the filesystems's parent.
 		 */
 		*cp = '\0';
 
 		if (flags->isprefix && !flags->istail && !flags->dryrun &&
 		    create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
 			zcmd_free_nvlists(&zc);
 			return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
 		}
 
 		newfs = B_TRUE;
 	}
 
 	zc.zc_begin_record = *drr_noswap;
 	zc.zc_cookie = infd;
 	zc.zc_guid = flags->force;
 	zc.zc_resumable = flags->resumable;
 	if (flags->verbose) {
 		(void) printf("%s %s stream of %s into %s\n",
 		    flags->dryrun ? "would receive" : "receiving",
 		    drrb->drr_fromguid ? "incremental" : "full",
 		    drrb->drr_toname, zc.zc_value);
 		(void) fflush(stdout);
 	}
 
 	if (flags->dryrun) {
 		zcmd_free_nvlists(&zc);
 		return (recv_skip(hdl, infd, flags->byteswap));
 	}
 
 	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
 	zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
 	zc.zc_cleanup_fd = cleanup_fd;
 	zc.zc_action_handle = *action_handlep;
 
 	err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
 	/* Save errno now; later calls may overwrite it. */
 	ioctl_errno = errno;
 	prop_errflags = (zprop_errflags_t)zc.zc_obj;
 
 	if (err == 0) {
 		nvlist_t *prop_errors;
 		VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
 		    zc.zc_nvlist_dst_size, &prop_errors, 0));
 
 		nvpair_t *prop_err = NULL;
 
 		while ((prop_err = nvlist_next_nvpair(prop_errors,
 		    prop_err)) != NULL) {
 			char tbuf[1024];
 			zfs_prop_t prop;
 			int intval;
 
 			prop = zfs_name_to_prop(nvpair_name(prop_err));
 			(void) nvpair_value_int32(prop_err, &intval);
 			if (strcmp(nvpair_name(prop_err),
 			    ZPROP_N_MORE_ERRORS) == 0) {
 				trunc_prop_errs(intval);
 				break;
 			} else if (snapname == NULL || finalsnap == NULL ||
 			    strcmp(finalsnap, snapname) == 0 ||
 			    strcmp(nvpair_name(prop_err),
 			    zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) {
 				/*
 				 * Skip the special case of, for example,
 				 * "refquota", errors on intermediate
 				 * snapshots leading up to a final one.
 				 * That's why we have all of the checks above.
 				 *
 				 * See zfs_ioctl.c's extract_delay_props() for
 				 * a list of props which can fail on
 				 * intermediate snapshots, but shouldn't
 				 * affect the overall receive.
 				 */
 				(void) snprintf(tbuf, sizeof (tbuf),
 				    dgettext(TEXT_DOMAIN,
 				    "cannot receive %s property on %s"),
 				    nvpair_name(prop_err), zc.zc_name);
 				zfs_setprop_error(hdl, prop, intval, tbuf);
 			}
 		}
 		nvlist_free(prop_errors);
 	}
 
 	/*
 	 * zc_nvlist_dst points at the on-stack prop_errbuf; clear it so
 	 * zcmd_free_nvlists() is not handed that address.
 	 */
 	zc.zc_nvlist_dst = 0;
 	zc.zc_nvlist_dst_size = 0;
 	zcmd_free_nvlists(&zc);
 
 	if (err == 0 && snapprops_nvlist) {
 		zfs_cmd_t zc2 = { 0 };
 
 		(void) strcpy(zc2.zc_name, zc.zc_value);
 		zc2.zc_cookie = B_TRUE; /* received */
 		if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) {
 			(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2);
 			zcmd_free_nvlists(&zc2);
 		}
 	}
 
 	if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) {
 		/*
 		 * It may be that this snapshot already exists,
 		 * in which case we want to consume & ignore it
 		 * rather than failing.
 		 */
 		avl_tree_t *local_avl;
 		nvlist_t *local_nv, *fs;
 		cp = strchr(zc.zc_value, '@');
 
 		/*
 		 * XXX Do this faster by just iterating over snaps in
 		 * this fs.  Also if zc_value does not exist, we will
 		 * get a strange "does not exist" error message.
 		 */
 		*cp = '\0';
 		if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE,
 		    &local_nv, &local_avl) == 0) {
 			*cp = '@';
 			fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
 			fsavl_destroy(local_avl);
 			nvlist_free(local_nv);
 
 			if (fs != NULL) {
 				if (flags->verbose) {
 					(void) printf("snap %s already exists; "
 					    "ignoring\n", zc.zc_value);
 				}
 				err = ioctl_err = recv_skip(hdl, infd,
 				    flags->byteswap);
 			}
 		}
 		*cp = '@';
 	}
 
 	if (ioctl_err != 0) {
 		switch (ioctl_errno) {
 		case ENODEV:
 			cp = strchr(zc.zc_value, '@');
 			*cp = '\0';
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "most recent snapshot of %s does not\n"
 			    "match incremental source"), zc.zc_value);
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			*cp = '@';
 			break;
 		case ETXTBSY:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination %s has been modified\n"
 			    "since most recent snapshot"), zc.zc_name);
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			break;
 		case EEXIST:
 			cp = strchr(zc.zc_value, '@');
 			if (newfs) {
 				/* it's the containing fs that exists */
 				*cp = '\0';
 			}
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination already exists"));
 			(void) zfs_error_fmt(hdl, EZFS_EXISTS,
 			    dgettext(TEXT_DOMAIN, "cannot restore to %s"),
 			    zc.zc_value);
 			*cp = '@';
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ECKSUM:
 			recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable);
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded to receive this stream."));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EDQUOT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination %s space quota exceeded"), zc.zc_name);
 			(void) zfs_error(hdl, EZFS_NOSPC, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl, ioctl_errno, errbuf);
 		}
 	}
 
 	/*
 	 * Mount the target filesystem (if created).  Also mount any
 	 * children of the target filesystem if we did a replication
 	 * receive (indicated by stream_avl being non-NULL).
 	 */
 	cp = strchr(zc.zc_value, '@');
 	if (cp && (ioctl_err == 0 || !newfs)) {
 		zfs_handle_t *h;
 
 		*cp = '\0';
 		h = zfs_open(hdl, zc.zc_value,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (h != NULL) {
 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
 				*cp = '@';
 			} else if (newfs || stream_avl) {
 				/*
 				 * Track the first/top of hierarchy fs,
 				 * for mounting and sharing later.
 				 */
 				if (top_zfs && *top_zfs == NULL)
 					*top_zfs = zfs_strdup(hdl, zc.zc_value);
 			}
 			zfs_close(h);
 		}
 		*cp = '@';
 	}
 
 	if (clp) {
 		err |= changelist_postfix(clp);
 		changelist_free(clp);
 	}
 
 	if (prop_errflags & ZPROP_ERR_NOCLEAR) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
 		    "failed to clear unreceived properties on %s"),
 		    zc.zc_name);
 		(void) fprintf(stderr, "\n");
 	}
 	if (prop_errflags & ZPROP_ERR_NORESTORE) {
 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
 		    "failed to restore original properties on %s"),
 		    zc.zc_name);
 		(void) fprintf(stderr, "\n");
 	}
 
 	if (err || ioctl_err)
 		return (-1);
 
 	*action_handlep = zc.zc_action_handle;
 
 	if (flags->verbose) {
 		char buf1[64];
 		char buf2[64];
 		uint64_t bytes = zc.zc_cookie;
 		time_t delta = time(NULL) - begin_time;
 		/* Avoid dividing by zero for sub-second receives. */
 		if (delta == 0)
 			delta = 1;
 		zfs_nicenum(bytes, buf1, sizeof (buf1));
 		zfs_nicenum(bytes/delta, buf2, sizeof (buf2));
 
 		/* time_t need not be unsigned long; cast to match %lu. */
 		(void) printf("received %sB stream in %lu seconds (%sB/sec)\n",
 		    buf1, (unsigned long)delta, buf2);
 	}
 
 	return (0);
 }
 
 /*
  * Read the BEGIN record of a send stream from infd, validate it, and hand
  * the rest of the stream to zfs_receive_one() (DMU_SUBSTREAM header) or
  * zfs_receive_package() (DMU_COMPOUNDSTREAM header).
  *
  * Returns ENODATA when the first record is a DRR_END (the second END record
  * that terminates a package), the recv_read() error if the header cannot be
  * read, the result of zfs_error() for a missing dataset or a malformed or
  * unsupported stream, and otherwise whatever the chosen receiver returns.
  *
  * As a side effect flags->byteswap is set to reflect the stream byte order.
  */
 static int
 zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
     const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
     nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
     uint64_t *action_handlep, const char *finalsnap)
 {
 	int err;
 	dmu_replay_record_t drr, drr_noswap;
 	struct drr_begin *drrb = &drr.drr_u.drr_begin;
 	char errbuf[1024];
 	zio_cksum_t zcksum = { 0 };
 	uint64_t featureflags;
 	int hdrtype;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
 	if (flags->isprefix &&
 	    !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs "
 		    "(%s) does not exist"), tosnap);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 	if (originsnap &&
 	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
 		    "(%s) does not exist"), originsnap);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	/* read in the BEGIN record */
 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
 	    &zcksum)))
 		return (err);
 
 	if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) {
 		/* It's the double end record at the end of a package */
 		return (ENODATA);
 	}
 
 	/* the kernel needs the non-byteswapped begin record */
 	drr_noswap = drr;
 
 	flags->byteswap = B_FALSE;
 	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		/*
 		 * We computed the checksum in the wrong byteorder in
 		 * recv_read() above; do it again correctly.
 		 */
 		bzero(&zcksum, sizeof (zio_cksum_t));
 		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
 		flags->byteswap = B_TRUE;
 
 		drr.drr_type = BSWAP_32(drr.drr_type);
 		drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
 		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
 		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
 		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
 		drrb->drr_type = BSWAP_32(drrb->drr_type);
 		drrb->drr_flags = BSWAP_32(drrb->drr_flags);
 		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
 		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
 	}
 
 	if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad magic number)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);
 
 	if (!DMU_STREAM_SUPPORTED(featureflags) ||
 	    (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
 		/*
 		 * featureflags is 64 bits wide; print it through an explicit
 		 * unsigned long long so the conversion matches the argument
 		 * on platforms where long is only 32 bits.
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "stream has unsupported feature, feature flags = %llx"),
 		    (unsigned long long)featureflags);
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (strchr(drrb->drr_toname, '@') == NULL) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad snapshot name)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
 		char nonpackage_sendfs[ZFS_MAXNAMELEN];
 		if (sendfs == NULL) {
 			/*
 			 * We were not called from zfs_receive_package(). Get
 			 * the fs specified by 'zfs send'.
 			 */
 			char *cp;
 			(void) strlcpy(nonpackage_sendfs,
 			    drr.drr_u.drr_begin.drr_toname, ZFS_MAXNAMELEN);
 			if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
 				*cp = '\0';
 			sendfs = nonpackage_sendfs;
 			VERIFY(finalsnap == NULL);
 		}
 		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
 		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
 		    cleanup_fd, action_handlep, finalsnap));
 	} else {
 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 		    DMU_COMPOUNDSTREAM);
 		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
 		    &zcksum, top_zfs, cleanup_fd, action_handlep));
 	}
 }
 
 /*
  * Restore a backup of tosnap from the stream read on file descriptor infd.
  *
  * Returns 0 on total success, -2 if some things couldn't be
  * destroyed/renamed/promoted, and -1 if some things couldn't be received.
  * -1 overrides -2.  When -1 is returned and the resumable flag was
  * specified, the transfer can be resumed if the sending side supports it.
  */
 int
 zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
     recvflags_t *flags, int infd, avl_tree_t *stream_avl)
 {
 	char *origin = NULL;
 	char *top_fs = NULL;
 	uint64_t handle = 0;
 	int fd;
 	int rv;
 
 	/* An "origin" entry in props is optional; any other failure is fatal */
 	if (props != NULL) {
 		rv = nvlist_lookup_string(props, "origin", &origin);
 		if (rv != 0 && rv != ENOENT)
 			return (rv);
 	}
 
 	fd = open(ZFS_DEV, O_RDWR|O_EXCL);
 	VERIFY(fd >= 0);
 
 	rv = zfs_receive_impl(hdl, tosnap, origin, flags, infd, NULL, NULL,
 	    stream_avl, &top_fs, fd, &handle, NULL);
 
 	VERIFY(0 == close(fd));
 
 	if (rv == 0 && !flags->nomount && top_fs != NULL) {
 		prop_changelist_t *cl = NULL;
 		zfs_handle_t *zhp;
 
 		zhp = zfs_open(hdl, top_fs, ZFS_TYPE_FILESYSTEM);
 		if (zhp != NULL) {
 			cl = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
 			    CL_GATHER_MOUNT_ALWAYS, 0);
 			zfs_close(zhp);
 		}
 		if (cl != NULL) {
 			/* mount and share received datasets */
 			rv = changelist_postfix(cl);
 			changelist_free(cl);
 		}
 		/* any failure to open, gather, or postfix collapses to -1 */
 		if (zhp == NULL || cl == NULL || rv)
 			rv = -1;
 	}
 
 	free(top_fs);
 
 	return (rv);
 }
Index: head/cddl/contrib/opensolaris/lib/libzfs
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs	(revision 296518)
+++ head/cddl/contrib/opensolaris/lib/libzfs	(revision 296519)

Property changes on: head/cddl/contrib/opensolaris/lib/libzfs
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor/illumos/dist/lib/libzfs:r296518
Index: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	(revision 296518)
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	(revision 296519)
@@ -1,819 +1,820 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
  * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
  * It has the following characteristics:
  *
  *  - Thread Safe.  libzfs_core is accessible concurrently from multiple
  *  threads.  This is accomplished primarily by avoiding global data
  *  (e.g. caching).  Since it's thread-safe, there is no reason for a
  *  process to have multiple libzfs "instances".  Therefore, we store
  *  our few pieces of data (e.g. the file descriptor) in global
  *  variables.  The fd is reference-counted so that the libzfs_core
  *  library can be "initialized" multiple times (e.g. by different
  *  consumers within the same process).
  *
  *  - Committed Interface.  The libzfs_core interface will be committed,
  *  therefore consumers can compile against it and be confident that
  *  their code will continue to work on future releases of this code.
  *  Currently, the interface is Evolving (not Committed), but we intend
  *  to commit to it once it is more complete and we determine that it
  *  meets the needs of all consumers.
  *
  *  - Programmatic Error Handling.  libzfs_core communicates errors with
  *  defined error numbers, and doesn't print anything to stdout/stderr.
  *
  *  - Thin Layer.  libzfs_core is a thin layer, marshaling arguments
  *  to/from the kernel ioctls.  There is generally a 1:1 correspondence
  *  between libzfs_core functions and ioctls to /dev/zfs.
  *
  *  - Clear Atomicity.  Because libzfs_core functions are generally 1:1
  *  with kernel ioctls, and kernel ioctls are generally atomic, each
  *  libzfs_core function is atomic.  For example, creating multiple
  *  snapshots with a single call to lzc_snapshot() is atomic -- it
  *  can't fail with only some of the requested snapshots created, even
  *  in the event of power loss or system crash.
  *
  *  - Continued libzfs Support.  Some higher-level operations (e.g.
  *  support for "zfs send -R") are too complicated to fit the scope of
  *  libzfs_core.  This functionality will continue to live in libzfs.
  *  Where appropriate, libzfs will use the underlying atomic operations
  *  of libzfs_core.  For example, libzfs may implement "zfs send -R |
  *  zfs receive" by using individual "send one snapshot", rename,
  *  destroy, and "receive one snapshot" operations in libzfs_core.
  *  /sbin/zfs and /sbin/zpool will link with both libzfs and
  *  libzfs_core.  Other consumers should aim to use only libzfs_core,
  *  since that will be the supported, stable interface going forwards.
  */
 
 #define _IN_LIBZFS_CORE_
 
 #include <libzfs_core.h>
 #include <ctype.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <pthread.h>
 #include <sys/nvpair.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include "libzfs_core_compat.h"
 #include "libzfs_compat.h"
 
 #ifdef __FreeBSD__
 extern int zfs_ioctl_version;
 #endif
 
 /* Descriptor for /dev/zfs, opened by the first libzfs_core_init() call. */
 static int g_fd;
 /* Held by libzfs_core_init()/libzfs_core_fini() around g_fd/g_refcount. */
 static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
 /* Count of libzfs_core_init() calls not yet undone by libzfs_core_fini(). */
 static int g_refcount;
 
 /*
  * Take a reference on the library, opening /dev/zfs on the first reference.
  *
  * Returns 0 on success, or the errno from open() if the device could not
  * be opened (in which case no reference is taken).
  */
 int
 libzfs_core_init(void)
 {
 	int error = 0;
 
 	(void) pthread_mutex_lock(&g_lock);
 	if (g_refcount == 0) {
 		g_fd = open("/dev/zfs", O_RDWR);
 		if (g_fd < 0) {
 			/* Capture errno before any other call can change it. */
 			error = errno;
 		}
 	}
 	if (error == 0)
 		g_refcount++;
 	(void) pthread_mutex_unlock(&g_lock);
 
 	return (error);
 }
 
 /*
  * Drop one reference on the library; the last reference closes g_fd.
  */
 void
 libzfs_core_fini(void)
 {
 	(void) pthread_mutex_lock(&g_lock);
 	ASSERT3S(g_refcount, >, 0);
 	if (--g_refcount == 0)
 		(void) close(g_fd);
 	(void) pthread_mutex_unlock(&g_lock);
 }
 
 /*
  * Common path for the nvlist-based ioctls: pack "source" as the input
  * nvlist, issue ioctl "ioc" on g_fd with zc_name set to "name", and, when
  * the kernel reports that it filled the output buffer, unpack that buffer
  * into *resultp.
  *
  * Returns 0 on success, ENOMEM if the output buffer cannot be allocated,
  * the lzc_compat_pre() error (FreeBSD, old ioctl versions only), or the
  * errno left by the failing ioctl().
  */
 static int
 lzc_ioctl(zfs_ioc_t ioc, const char *name,
     nvlist_t *source, nvlist_t **resultp)
 {
 	zfs_cmd_t zc = { 0 };
 	int error = 0;
 	char *packed;
 #ifdef __FreeBSD__
 	nvlist_t *oldsource;
 #endif
 	size_t size;
 
 	ASSERT3S(g_refcount, >, 0);
 
 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 
 #ifdef __FreeBSD__
 	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
 		zfs_ioctl_version = get_zfs_ioctl_version();
 
 	if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
 		oldsource = source;
 		error = lzc_compat_pre(&zc, &ioc, &source);
 		/*
 		 * NOTE(review): this return happens before *resultp is
 		 * cleared below, so the caller's pointer is left untouched
 		 * on this path.
 		 */
 		if (error)
 			return (error);
 	}
 #endif
 
 	packed = fnvlist_pack(source, &size);
 	zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
 	zc.zc_nvlist_src_size = size;
 
 	if (resultp != NULL) {
 		*resultp = NULL;
 		/* Initial output buffer: twice the packed input, >= 128K. */
 		zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
 		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
 		    malloc(zc.zc_nvlist_dst_size);
 #ifdef illumos
 		if (zc.zc_nvlist_dst == NULL) {
 #else
 		if (zc.zc_nvlist_dst == 0) {
 #endif
 			error = ENOMEM;
 			goto out;
 		}
 	}
 
 	while (ioctl(g_fd, ioc, &zc) != 0) {
 		/*
 		 * ENOMEM with an output buffer requested: double the buffer
 		 * and retry.  Any other failure ends the loop with its errno.
 		 */
 		if (errno == ENOMEM && resultp != NULL) {
 			free((void *)(uintptr_t)zc.zc_nvlist_dst);
 			zc.zc_nvlist_dst_size *= 2;
 			zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
 			    malloc(zc.zc_nvlist_dst_size);
 #ifdef illumos
 			if (zc.zc_nvlist_dst == NULL) {
 #else
 			if (zc.zc_nvlist_dst == 0) {
 #endif
 				error = ENOMEM;
 				goto out;
 			}
 		} else {
 			error = errno;
 			break;
 		}
 	}
 
 #ifdef __FreeBSD__
 	if (zfs_ioctl_version < ZFS_IOCVER_LZC)
 		lzc_compat_post(&zc, ioc);
 #endif
 	/*
 	 * NOTE(review): resultp is dereferenced here without a NULL check;
 	 * presumably zc_nvlist_dst_filled is only set when a destination
 	 * buffer was supplied -- confirm.
 	 */
 	if (zc.zc_nvlist_dst_filled) {
 		*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
 		    zc.zc_nvlist_dst_size);
 	}
 #ifdef __FreeBSD__
 	if (zfs_ioctl_version < ZFS_IOCVER_LZC)
 		lzc_compat_outnvl(&zc, ioc, resultp);
 #endif
 out:
 #ifdef __FreeBSD__
 	if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
 		/* Free the replacement nvlist if lzc_compat_pre() made one. */
 		if (source != oldsource)
 			nvlist_free(source);
 		source = oldsource;
 	}
 #endif
 	fnvlist_pack_free(packed, size);
 	/* zc_nvlist_dst is 0 when no output buffer was allocated. */
 	free((void *)(uintptr_t)zc.zc_nvlist_dst);
 	return (error);
 }
 
 /*
  * Issue ZFS_IOC_CREATE for fsname with the given objset type and, when
  * props is non-NULL, the given properties.  Returns the lzc_ioctl() result.
  */
 int
 lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props)
 {
 	nvlist_t *innvl = fnvlist_alloc();
 	int rc;
 
 	fnvlist_add_int32(innvl, "type", type);
 	if (props != NULL)
 		fnvlist_add_nvlist(innvl, "props", props);
 
 	rc = lzc_ioctl(ZFS_IOC_CREATE, fsname, innvl, NULL);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * Issue ZFS_IOC_CLONE for fsname, naming "origin" as the clone origin and,
  * when props is non-NULL, passing the given properties.  Returns the
  * lzc_ioctl() result.
  */
 int
 lzc_clone(const char *fsname, const char *origin,
     nvlist_t *props)
 {
 	nvlist_t *innvl = fnvlist_alloc();
 	int rc;
 
 	fnvlist_add_string(innvl, "origin", origin);
 	if (props != NULL)
 		fnvlist_add_nvlist(innvl, "props", props);
 
 	rc = lzc_ioctl(ZFS_IOC_CLONE, fsname, innvl, NULL);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * Creates snapshots.
  *
  * The keys in the snaps nvlist are the snapshots to be created.
  * They must all be in the same pool.
  *
  * The props nvlist is properties to set.  Currently only user properties
  * are supported.  { user:prop_name -> string value }
  *
  * The returned results nvlist will have an entry for each snapshot that failed.
  * The value will be the (int32) error code.
  *
  * The return value will be 0 if all snapshots were created, otherwise it will
  * be the errno of a (unspecified) snapshot that failed.
  */
 int
 lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 	nvlist_t *innvl;
 	int rc;
 
 	*errlist = NULL;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(snaps, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '@'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/@")] = '\0';
 
 	innvl = fnvlist_alloc();
 	fnvlist_add_nvlist(innvl, "snaps", snaps);
 	if (props != NULL)
 		fnvlist_add_nvlist(innvl, "props", props);
 
 	rc = lzc_ioctl(ZFS_IOC_SNAPSHOT, poolname, innvl, errlist);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * Destroys snapshots.
  *
  * The keys in the snaps nvlist are the snapshots to be destroyed.
  * They must all be in the same pool.
  *
  * Snapshots that do not exist will be silently ignored.
  *
  * If 'defer' is not set, and a snapshot has user holds or clones, the
  * destroy operation will fail and none of the snapshots will be
  * destroyed.
  *
  * If 'defer' is set, and a snapshot has user holds or clones, it will be
  * marked for deferred destruction, and will be destroyed when the last hold
  * or clone is removed/destroyed.
  *
  * The return value will be 0 if all snapshots were destroyed (or marked for
  * later destruction if 'defer' is set) or didn't exist to begin with.
  *
  * Otherwise the return value will be the errno of a (unspecified) snapshot
  * that failed, no snapshots will be destroyed, and the errlist will have an
  * entry for each snapshot that failed.  The value in the errlist will be
  * the (int32) error code.
  */
 int
 lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 	nvlist_t *innvl;
 	int rc;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(snaps, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '@'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/@")] = '\0';
 
 	innvl = fnvlist_alloc();
 	fnvlist_add_nvlist(innvl, "snaps", snaps);
 	if (defer)
 		fnvlist_add_boolean(innvl, "defer");
 
 	rc = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, poolname, innvl, errlist);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * Issue ZFS_IOC_SPACE_SNAPS on lastsnap with "firstsnap" as its argument
  * and, on success, store the "used" value from the reply in *usedp.
  *
  * Returns EINVAL if firstsnap contains no '@', otherwise the lzc_ioctl()
  * result.  *usedp is only written when 0 is returned.
  */
 int
 lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
     uint64_t *usedp)
 {
 	nvlist_t *args;
 	/*
 	 * Start from NULL: lzc_ioctl() can return before it stores anything
 	 * in its result pointer, and the value is freed unconditionally.
 	 */
 	nvlist_t *result = NULL;
 	int err;
 	char fs[MAXNAMELEN];
 	char *atp;
 
 	/*
 	 * Determine the fs name.  The copy is only used to reject a
 	 * firstsnap that is not a snapshot name.
 	 */
 	(void) strlcpy(fs, firstsnap, sizeof (fs));
 	atp = strchr(fs, '@');
 	if (atp == NULL)
 		return (EINVAL);
 	*atp = '\0';
 
 	args = fnvlist_alloc();
 	fnvlist_add_string(args, "firstsnap", firstsnap);
 
 	err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
 	nvlist_free(args);
 	if (err == 0)
 		*usedp = fnvlist_lookup_uint64(result, "used");
 	fnvlist_free(result);
 
 	return (err);
 }
 
 /*
  * Report whether ZFS_IOC_OBJSET_STATS succeeds for the named dataset.
  */
 boolean_t
 lzc_exists(const char *dataset)
 {
 	/*
 	 * The objset_stats ioctl is still legacy, so a zfs_cmd_t is built
 	 * by hand here instead of going through lzc_ioctl().
 	 */
 	zfs_cmd_t zc = { 0 };
 	int rc;
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 	rc = ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc);
 
 	return (rc == 0);
 }
 
 /*
  * Create "user holds" on snapshots.  If there is a hold on a snapshot,
  * the snapshot can not be destroyed.  (However, it can be marked for deletion
  * by lzc_destroy_snaps(defer=B_TRUE).)
  *
  * The keys in the nvlist are snapshot names.
  * The snapshots must all be in the same pool.
  * The value is the name of the hold (string type).
  *
  * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
  * In this case, when the cleanup_fd is closed (including on process
  * termination), the holds will be released.  If the system is shut down
  * uncleanly, the holds will be released when the pool is next opened
  * or imported.
  *
  * Holds for snapshots which don't exist will be skipped and have an entry
  * added to errlist, but will not cause an overall failure.
  *
  * The return value will be 0 if all holds, for snapshots that existed,
  * were successfully created.
  *
  * Otherwise the return value will be the errno of a (unspecified) hold that
  * failed and no holds will be created.
  *
  * In all cases the errlist will have an entry for each hold that failed
  * (name = snapshot), with its value being the error code (int32).
  */
 int
 lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 	nvlist_t *innvl;
 	int rc;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(holds, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '@'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/@")] = '\0';
 
 	innvl = fnvlist_alloc();
 	fnvlist_add_nvlist(innvl, "holds", holds);
 	/* -1 means "no cleanup fd"; the key is simply left out then. */
 	if (cleanup_fd != -1)
 		fnvlist_add_int32(innvl, "cleanup_fd", cleanup_fd);
 
 	rc = lzc_ioctl(ZFS_IOC_HOLD, poolname, innvl, errlist);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * Release "user holds" on snapshots.  If the snapshot has been marked for
  * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
  * any clones, and all the user holds are removed, then the snapshot will be
  * destroyed.
  *
  * The keys in the nvlist are snapshot names.
  * The snapshots must all be in the same pool.
  * The value is a nvlist whose keys are the holds to remove.
  *
  * Holds which failed to release because they didn't exist will have an entry
  * added to errlist, but will not cause an overall failure.
  *
  * The return value will be 0 if the nvl holds was empty or all holds that
  * existed, were successfully removed.
  *
  * Otherwise the return value will be the errno of a (unspecified) hold that
  * failed to release and no holds will be released.
  *
  * In all cases the errlist will have an entry for each hold that failed
  * to release.
  */
 int
 lzc_release(nvlist_t *holds, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 	int rc;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(holds, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '@'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/@")] = '\0';
 
 	/* The holds nvlist itself is the ioctl argument. */
 	rc = lzc_ioctl(ZFS_IOC_RELEASE, poolname, holds, errlist);
 
 	return (rc);
 }
 
 /*
  * Retrieve list of user holds on the specified snapshot.
  *
  * On success, *holdsp will be set to a nvlist which the caller must free.
  * The keys are the names of the holds, and the value is the creation time
  * of the hold (uint64) in seconds since the epoch.
  */
 int
 lzc_get_holds(const char *snapname, nvlist_t **holdsp)
 {
 	nvlist_t *empty;
 	int rc;
 
 	/* The ioctl takes no arguments, so an empty nvlist is passed. */
 	empty = fnvlist_alloc();
 	rc = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, empty, holdsp);
 	fnvlist_free(empty);
 
 	return (rc);
 }
 
 /*
  * Generate a zfs send stream for the specified snapshot and write it to
  * the specified file descriptor.
  *
  * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
  *
  * If "from" is NULL, a full (non-incremental) stream will be sent.
  * If "from" is non-NULL, it must be the full name of a snapshot or
  * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
  * "pool/fs#earlier_bmark").  If non-NULL, the specified snapshot or
  * bookmark must represent an earlier point in the history of "snapname".
  * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
  * or it can be the origin of "snapname"'s filesystem, or an earlier
  * snapshot in the origin, etc.
  *
  * "fd" is the file descriptor to write the send stream to.
  *
  * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
  * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
  * records with drr_blksz > 128K.
  *
  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
  * which the receiving system must support (as indicated by support
  * for the "embedded_data" feature).
  */
 int
 lzc_send(const char *snapname, const char *from, int fd,
     enum lzc_send_flags flags)
 {
 	int rc;
 
 	/* A plain send is a resumable send with object 0 and offset 0. */
 	rc = lzc_send_resume(snapname, from, fd, flags, 0, 0);
 
 	return (rc);
 }
 
 /*
  * Issue ZFS_IOC_SEND_NEW for snapname, writing the stream to fd.  "from"
  * and the two LZC_SEND_FLAG_* options are passed only when set, and the
  * resume position only when resumeobj or resumeoff is nonzero.  Returns
  * the lzc_ioctl() result.
  */
 int
 lzc_send_resume(const char *snapname, const char *from, int fd,
     enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
 {
 	nvlist_t *innvl = fnvlist_alloc();
 	int rc;
 
 	fnvlist_add_int32(innvl, "fd", fd);
 
 	if (from != NULL)
 		fnvlist_add_string(innvl, "fromsnap", from);
 
 	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
 		fnvlist_add_boolean(innvl, "largeblockok");
 
 	if (flags & LZC_SEND_FLAG_EMBED_DATA)
 		fnvlist_add_boolean(innvl, "embedok");
 
 	/* Both resume keys are sent together, or neither is. */
 	if (!(resumeobj == 0 && resumeoff == 0)) {
 		fnvlist_add_uint64(innvl, "resume_object", resumeobj);
 		fnvlist_add_uint64(innvl, "resume_offset", resumeoff);
 	}
 
 	rc = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, innvl, NULL);
 	nvlist_free(innvl);
 
 	return (rc);
 }
 
 /*
  * "from" can be NULL, a snapshot, or a bookmark.
  *
  * If from is NULL, a full (non-incremental) stream will be estimated.  This
  * is calculated very efficiently.
  *
  * If from is a snapshot, lzc_send_space uses the deadlists attached to
  * each snapshot to efficiently estimate the stream size.
  *
  * If from is a bookmark, the indirect blocks in the destination snapshot
  * are traversed, looking for blocks with a birth time since the creation TXG of
  * the snapshot this bookmark was created from.  This will result in
  * significantly more I/O and be less efficient than a send space estimation on
  * an equivalent snapshot.
  */
 int
 lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
 {
 	nvlist_t *args;
 	/*
 	 * Start from NULL: lzc_ioctl() can return before it stores anything
 	 * in its result pointer, and the value is freed unconditionally.
 	 */
 	nvlist_t *result = NULL;
 	int err;
 
 	args = fnvlist_alloc();
 	if (from != NULL)
 		fnvlist_add_string(args, "from", from);
 	err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
 	nvlist_free(args);
 	/* *spacep is only written when the ioctl succeeded. */
 	if (err == 0)
 		*spacep = fnvlist_lookup_uint64(result, "space");
 	nvlist_free(result);
 	return (err);
 }
 
 /*
  * Read exactly ilen bytes from fd into buf.
  *
  * Returns 0 once the whole buffer has been filled, or EIO if read() fails
  * or reaches end-of-file first.  A read() interrupted by a signal (EINTR)
  * is retried rather than reported as a failure.
  */
 static int
 recv_read(int fd, void *buf, int ilen)
 {
 	char *cp = buf;
 	int len = ilen;
 
 	/* A negative length can never be satisfied. */
 	if (ilen < 0)
 		return (EIO);
 
 	while (len > 0) {
 		ssize_t rv = read(fd, cp, len);
 
 		if (rv < 0 && errno == EINTR)
 			continue;
 		/* Hard error, or EOF with bytes still outstanding. */
 		if (rv <= 0)
 			return (EIO);
 
 		/* Only advance past bytes that were actually read. */
 		cp += rv;
 		len -= (int)rv;
 	}
 
 	return (0);
 }
 
 /*
  * Shared body of lzc_receive() and lzc_receive_resumable(): fill in a
  * zfs_cmd_t for the snapshot to create, read the BEGIN record from fd,
  * and issue ZFS_IOC_RECV.
  *
  * Returns EINVAL if snapname contains no '@', ENOENT if the target
  * filesystem does not exist and its name has no '/' to fall back on, the
  * recv_read() error if the BEGIN record cannot be read, the errno left by
  * a failing ioctl(), or 0 on success.
  */
 static int
 lzc_receive_impl(const char *snapname, nvlist_t *props, const char *origin,
     boolean_t force, boolean_t resumable, int fd)
 {
 	/*
 	 * The receive ioctl is still legacy, so a zfs_cmd_t is built by
 	 * hand here instead of going through lzc_ioctl().
 	 */
 	zfs_cmd_t zc = { 0 };
 	char *packed = NULL;
 	char *sep;
 	size_t size;
 	int error;
 
 	ASSERT3S(g_refcount, >, 0);
 
 	/* zc_name is the containing filesystem: snapname up to the '@' */
 	(void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
 	sep = strchr(zc.zc_name, '@');
 	if (sep == NULL)
 		return (EINVAL);
 	*sep = '\0';
 
 	/* if that filesystem does not exist, name its parent instead */
 	if (!lzc_exists(zc.zc_name)) {
 		sep = strrchr(zc.zc_name, '/');
 		if (sep == NULL)
 			return (ENOENT);
 		*sep = '\0';
 	}
 
 	/* zc_value is the full name of the snapshot to create */
 	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
 
 	/* zc_string is the clone origin (if DRR_FLAG_CLONE) */
 	if (origin != NULL)
 		(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
 
 	zc.zc_cookie = fd;		/* fd to read the stream from */
 	zc.zc_guid = force;		/* force flag */
 	zc.zc_resumable = resumable;
 	zc.zc_cleanup_fd = -1;		/* unused */
 
 	/* zc_nvlist_src is the packed props to set, when any were given */
 	if (props != NULL) {
 		packed = fnvlist_pack(props, &size);
 		zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
 		zc.zc_nvlist_src_size = size;
 	}
 
 	/* zc_begin_record is the non-byteswapped BEGIN record */
 	error = recv_read(fd, &zc.zc_begin_record, sizeof (zc.zc_begin_record));
 	if (error == 0 && ioctl(g_fd, ZFS_IOC_RECV, &zc) != 0)
 		error = errno;
 
 	if (packed != NULL)
 		fnvlist_pack_free(packed, size);
 	free((void*)(uintptr_t)zc.zc_nvlist_dst);
 
 	return (error);
 }
 
 /*
  * The simplest receive case: receive from the specified fd, creating the
  * specified snapshot.  Apply the specified properties as "received" properties
  * (which can be overridden by locally-set properties).  If the stream is a
  * clone, its origin snapshot must be specified by 'origin'.  The 'force'
  * flag will cause the target filesystem to be rolled back or destroyed if
  * necessary to receive.
  *
  * Return 0 on success or an errno on failure.
  *
  * Note: this interface does not work on dedup'd streams
  * (those with DMU_BACKUP_FEATURE_DEDUP).
  */
 int
 lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
     boolean_t force, int fd)
 {
 	int rc;
 
 	/* B_FALSE: this entry point never requests a resumable receive. */
 	rc = lzc_receive_impl(snapname, props, origin, force, B_FALSE, fd);
 
 	return (rc);
 }
 
 /*
  * Like lzc_receive, but if the receive fails due to premature stream
  * termination, the intermediate state will be preserved on disk.  In this
  * case, ECKSUM will be returned.  The receive may subsequently be resumed
  * with a resuming send stream generated by lzc_send_resume().
  */
 int
 lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
     boolean_t force, int fd)
 {
 	int rc;
 
 	/* B_TRUE: ask lzc_receive_impl() for a resumable receive. */
 	rc = lzc_receive_impl(snapname, props, origin, force, B_TRUE, fd);
 
 	return (rc);
 }
 
 /*
  * Roll back this filesystem or volume to its most recent snapshot.
  * If snapnamebuf is not NULL, it will be filled in with the name
  * of the most recent snapshot.
  *
  * Return 0 on success or an errno on failure.
  */
 int
 lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
 {
 	nvlist_t *args;
 	/*
 	 * Start from NULL: lzc_ioctl() can return before it stores anything
 	 * in its result pointer, and the value is freed unconditionally.
 	 */
 	nvlist_t *result = NULL;
 	int err;
 
 	args = fnvlist_alloc();
 	err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
 	nvlist_free(args);
 	if (err == 0 && snapnamebuf != NULL) {
 		/* "target" in the reply is copied out as the snapshot name. */
 		const char *snapname = fnvlist_lookup_string(result, "target");
 		(void) strlcpy(snapnamebuf, snapname, snapnamelen);
 	}
 	/* The reply is ours to free; it was previously leaked. */
 	nvlist_free(result);
 	return (err);
 }
 
 /*
  * Creates bookmarks.
  *
  * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
  * the name of the snapshot (e.g. "pool/fs@snap").  All the bookmarks and
  * snapshots must be in the same pool.
  *
  * The returned results nvlist will have an entry for each bookmark that failed.
  * The value will be the (int32) error code.
  *
  * The return value will be 0 if all bookmarks were created, otherwise it will
  * be the errno of an (undetermined) bookmark that failed.
  */
 int
 lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(bookmarks, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '#'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/#")] = '\0';
 
 	return (lzc_ioctl(ZFS_IOC_BOOKMARK, poolname, bookmarks, errlist));
 }
 
 /*
  * Retrieve bookmarks.
  *
  * Retrieve the list of bookmarks for the given file system. The props
  * parameter is an nvlist of property names (with no values) that will be
  * returned for each bookmark.
  *
  * The following are valid properties on bookmarks, all of which are numbers
  * (represented as uint64 in the nvlist)
  *
  * "guid" - globally unique identifier of the snapshot it refers to
  * "createtxg" - txg when the snapshot it refers to was created
  * "creation" - timestamp when the snapshot it refers to was created
  *
  * The format of the returned nvlist is as follows:
  * <short name of bookmark> -> {
  *     <name of property> -> {
  *         "value" -> uint64
  *     }
  *  }
  */
 int
 lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
 {
 	int rc;
 
 	/* props is passed through unchanged as the ioctl argument. */
 	rc = lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks);
 
 	return (rc);
 }
 
 /*
  * Destroys bookmarks.
  *
  * The keys in the bmarks nvlist are the bookmarks to be destroyed.
  * They must all be in the same pool.  Bookmarks are specified as
  * <fs>#<bmark>.
  *
  * Bookmarks that do not exist will be silently ignored.
  *
  * The return value will be 0 if all bookmarks that existed were destroyed.
  *
  * Otherwise the return value will be the errno of a (undetermined) bookmark
  * that failed, no bookmarks will be destroyed, and the errlist will have an
  * entry for each bookmark that failed.  The value in the errlist will be
  * the (int32) error code.
  */
 int
 lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
 {
 	char poolname[MAXNAMELEN];
 	nvpair_t *first;
 
 	/* An empty request succeeds without calling into the kernel. */
 	first = nvlist_next_nvpair(bmarks, NULL);
 	if (first == NULL)
 		return (0);
 
 	/* The pool name is whatever precedes the first '/' or '#'. */
 	(void) strlcpy(poolname, nvpair_name(first), sizeof (poolname));
 	poolname[strcspn(poolname, "/#")] = '\0';
 
 	return (lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, poolname, bmarks,
 	    errlist));
 }
Index: head/cddl/contrib/opensolaris
===================================================================
--- head/cddl/contrib/opensolaris	(revision 296518)
+++ head/cddl/contrib/opensolaris	(revision 296519)

Property changes on: head/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor/illumos/dist:r296518
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	(revision 296519)
@@ -1,249 +1,250 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #else
 #include <errno.h>
 #include <string.h>
 #endif
 #include <sys/debug.h>
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 #include "zfeature_common.h"
 
 /*
  * Set to disable all feature checks while opening pools, allowing pools with
  * unsupported features to be opened. Set for testing only.
  */
 boolean_t zfeature_checks_disable = B_FALSE;
 
 zfeature_info_t spa_feature_table[SPA_FEATURES];
 
 /*
  * Reports whether c may appear in a feature guid.  The accepted set is
  * mostly an aesthetic choice and could be widened later.  Lowercase letters
  * and digits are accepted everywhere; the reverse-DNS portion (before the
  * colon) additionally allows '.' and '-', while the short name (after the
  * colon) additionally allows '_'.
  */
 static int
 valid_char(char c, boolean_t after_colon)
 {
 	if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
 		return (1);
 
 	if (after_colon)
 		return (c == '_');
 
 	return (c == '.' || c == '-');
 }
 
 /*
  * A feature guid is a reverse-DNS organization name and the feature's
  * "short" name joined by exactly one colon (e.g.
  * "com.company:feature_name").  Returns B_TRUE only when the colon is
  * present exactly once and every other character is acceptable for the
  * side of the colon on which it appears.
  */
 boolean_t
 zfeature_is_valid_guid(const char *name)
 {
 	boolean_t seen_colon = B_FALSE;
 	const char *p;
 
 	for (p = name; *p != '\0'; p++) {
 		if (*p != ':') {
 			if (!valid_char(*p, seen_colon))
 				return (B_FALSE);
 		} else if (seen_colon) {
 			/* a second colon is never allowed */
 			return (B_FALSE);
 		} else {
 			seen_colon = B_TRUE;
 		}
 	}
 
 	return (seen_colon);
 }
 
 /*
  * Returns B_TRUE if guid matches the guid of an entry in spa_feature_table,
  * or unconditionally when zfeature_checks_disable is set.
  */
 boolean_t
 zfeature_is_supported(const char *guid)
 {
 	if (!zfeature_checks_disable) {
 		for (spa_feature_t fid = 0; fid < SPA_FEATURES; fid++) {
 			if (strcmp(guid, spa_feature_table[fid].fi_guid) == 0)
 				return (B_TRUE);
 		}
 		return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Looks up a feature by its user-facing name.  On a match, stores the
  * feature id in *res (when res is not NULL) and returns 0; returns ENOENT
  * if no entry in spa_feature_table has that name.
  */
 int
 zfeature_lookup_name(const char *name, spa_feature_t *res)
 {
 	for (spa_feature_t fid = 0; fid < SPA_FEATURES; fid++) {
 		if (strcmp(name, spa_feature_table[fid].fi_uname) != 0)
 			continue;
 
 		if (res != NULL)
 			*res = fid;
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Returns B_TRUE if 'check' appears in the dependency list registered for
  * feature 'fid'.  Only that list is searched.
  */
 boolean_t
 zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
 {
 	const spa_feature_t *dep;
 
 	/* The dependency array is terminated by SPA_FEATURE_NONE. */
 	for (dep = spa_feature_table[fid].fi_depends;
 	    *dep != SPA_FEATURE_NONE; dep++) {
 		if (*dep == check)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Fills in the spa_feature_table entry for feature 'fid'.
  *
  *	fid	index of the entry to fill in; must be below SPA_FEATURES
  *	guid	on-disk identifier; must satisfy zfeature_is_valid_guid()
  *	name	user-facing feature name
  *	desc	feature description
  *	flags	ZFEATURE_FLAG_* bits; READONLY_COMPAT and MOS may not be
  *		combined
  *	deps	dependency array terminated by SPA_FEATURE_NONE, or NULL for
  *		a feature without dependencies
  *
  * The strings and the dependency array are stored by reference, not
  * copied, so they must remain valid for as long as the table is in use.
  */
 static void
 zfeature_register(spa_feature_t fid, const char *guid, const char *name,
     const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
 {
 	zfeature_info_t *feature = &spa_feature_table[fid];
 	/* Shared empty dependency list; only ever read, hence const. */
 	static const spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
 
 	ASSERT(guid != NULL);
 	ASSERT(name != NULL);
 	ASSERT(desc != NULL);
 	ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
 	    (flags & ZFEATURE_FLAG_MOS) == 0);
 	ASSERT3U(fid, <, SPA_FEATURES);
 	ASSERT(zfeature_is_valid_guid(guid));
 
 	if (deps == NULL)
 		deps = nodeps;
 
 	feature->fi_feature = fid;
 	feature->fi_guid = guid;
 	feature->fi_uname = name;
 	feature->fi_desc = desc;
 	feature->fi_flags = flags;
 	feature->fi_depends = deps;
 }
 
 /*
  * Registers every pool feature known to this build in spa_feature_table.
  *
  * Each dependency array is a function-scope static const array terminated
  * by SPA_FEATURE_NONE; zfeature_register() stores a pointer to it rather
  * than a copy.  The sha512, skein and edonr features are registered only
  * when "illumos" is defined.
  */
 void
 zpool_feature_init(void)
 {
 	zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
 	    "com.delphix:async_destroy", "async_destroy",
 	    "Destroy filesystems asynchronously.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
 	zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
 	    "com.delphix:empty_bpobj", "empty_bpobj",
 	    "Snapshots use less space.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
 	zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
 	    "org.illumos:lz4_compress", "lz4_compress",
 	    "LZ4 compression algorithm support.",
 	    ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
 
 	zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
 	    "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
 	    "Crash dumps to multiple vdev pools.",
 	    0, NULL);
 
 	zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
 	    "com.delphix:spacemap_histogram", "spacemap_histogram",
 	    "Spacemaps maintain space histograms.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
 	zfeature_register(SPA_FEATURE_ENABLED_TXG,
 	    "com.delphix:enabled_txg", "enabled_txg",
 	    "Record txg at which a feature is enabled",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
 	static const spa_feature_t hole_birth_deps[] = {
 		SPA_FEATURE_ENABLED_TXG,
 		SPA_FEATURE_NONE
 	};
 	zfeature_register(SPA_FEATURE_HOLE_BIRTH,
 	    "com.delphix:hole_birth", "hole_birth",
 	    "Retain hole birth txg for more precise zfs send",
 	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
 	    hole_birth_deps);
 
 	zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
 	    "com.delphix:extensible_dataset", "extensible_dataset",
 	    "Enhanced dataset functionality, used by other features.",
 	    0, NULL);
 
 	static const spa_feature_t bookmarks_deps[] = {
 		SPA_FEATURE_EXTENSIBLE_DATASET,
 		SPA_FEATURE_NONE
 	};
 	zfeature_register(SPA_FEATURE_BOOKMARKS,
 	    "com.delphix:bookmarks", "bookmarks",
 	    "\"zfs bookmark\" command",
 	    ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
 
 	static const spa_feature_t filesystem_limits_deps[] = {
 		SPA_FEATURE_EXTENSIBLE_DATASET,
 		SPA_FEATURE_NONE
 	};
 	zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
 	    "com.joyent:filesystem_limits", "filesystem_limits",
 	    "Filesystem and snapshot limits.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
 
 	zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
 	    "com.delphix:embedded_data", "embedded_data",
 	    "Blocks which compress very well use even less space.",
 	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
 	    NULL);
 
 	static const spa_feature_t large_blocks_deps[] = {
 		SPA_FEATURE_EXTENSIBLE_DATASET,
 		SPA_FEATURE_NONE
 	};
 	zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
 	    "org.open-zfs:large_blocks", "large_blocks",
 	    "Support for blocks larger than 128KB.",
 	    ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
 
 #ifdef illumos
 	zfeature_register(SPA_FEATURE_SHA512,
 	    "org.illumos:sha512", "sha512",
 	    "SHA-512/256 hash algorithm.",
 	    ZFEATURE_FLAG_PER_DATASET, NULL);
 	zfeature_register(SPA_FEATURE_SKEIN,
 	    "org.illumos:skein", "skein",
 	    "Skein hash algorithm.",
 	    ZFEATURE_FLAG_PER_DATASET, NULL);
 	zfeature_register(SPA_FEATURE_EDONR,
 	    "org.illumos:edonr", "edonr",
 	    "Edon-R hash algorithm.",
 	    ZFEATURE_FLAG_PER_DATASET, NULL);
 #endif
 }
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	(revision 296519)
@@ -1,103 +1,104 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _ZFEATURE_COMMON_H
 #define	_ZFEATURE_COMMON_H
 
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct zfeature_info;
 
 /*
  * Identifiers for the pool features.  SPA_FEATURE_NONE (-1) is not a
  * feature; it terminates the fi_depends arrays in zfeature_info_t.
  * SPA_FEATURES comes last, so it equals the number of features and is used
  * as the size of spa_feature_table.  The sha512, skein and edonr entries
  * exist only when "illumos" is defined.
  */
 typedef enum spa_feature {
 	SPA_FEATURE_NONE = -1,
 	SPA_FEATURE_ASYNC_DESTROY,
 	SPA_FEATURE_EMPTY_BPOBJ,
 	SPA_FEATURE_LZ4_COMPRESS,
 	SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
 	SPA_FEATURE_SPACEMAP_HISTOGRAM,
 	SPA_FEATURE_ENABLED_TXG,
 	SPA_FEATURE_HOLE_BIRTH,
 	SPA_FEATURE_EXTENSIBLE_DATASET,
 	SPA_FEATURE_EMBEDDED_DATA,
 	SPA_FEATURE_BOOKMARKS,
 	SPA_FEATURE_FS_SS_LIMIT,
 	SPA_FEATURE_LARGE_BLOCKS,
 #ifdef illumos
 	SPA_FEATURE_SHA512,
 	SPA_FEATURE_SKEIN,
 	SPA_FEATURE_EDONR,
 #endif
 	SPA_FEATURES
 } spa_feature_t;
 
 #define	SPA_FEATURE_DISABLED	(-1ULL)
 
 /*
  * Bit flags describing a feature; stored in the fi_flags member of
  * zfeature_info_t.  Each flag occupies its own bit so flags can be OR-ed
  * together.
  */
 typedef enum zfeature_flags {
 	/* Can open pool readonly even if this feature is not supported. */
 	ZFEATURE_FLAG_READONLY_COMPAT =		(1 << 0),
 	/* Is this feature necessary to read the MOS? */
 	ZFEATURE_FLAG_MOS =			(1 << 1),
 	/* Activate this feature at the same time it is enabled. */
 	ZFEATURE_FLAG_ACTIVATE_ON_ENABLE =	(1 << 2),
 	/* Each dataset has a field set if it has ever used this feature. */
 	ZFEATURE_FLAG_PER_DATASET =		(1 << 3)
 } zfeature_flags_t;
 
 /*
  * Description of a single pool feature; spa_feature_table holds one of
  * these per spa_feature_t value.
  */
 typedef struct zfeature_info {
 	spa_feature_t fi_feature;	/* Identifier of this feature */
 	const char *fi_uname;	/* User-facing feature name */
 	const char *fi_guid;	/* On-disk feature identifier */
 	const char *fi_desc;	/* Feature description */
 	zfeature_flags_t fi_flags;	/* ZFEATURE_FLAG_* bits */
 	/* array of dependencies, terminated by SPA_FEATURE_NONE */
 	const spa_feature_t *fi_depends;
 } zfeature_info_t;
 
 typedef int (zfeature_func_t)(zfeature_info_t *, void *);
 
 #define	ZFS_FEATURE_DEBUG
 
 extern zfeature_info_t spa_feature_table[SPA_FEATURES];
 
 extern boolean_t zfeature_is_valid_guid(const char *);
 
 extern boolean_t zfeature_is_supported(const char *);
 extern int zfeature_lookup_name(const char *, spa_feature_t *);
 extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
 
 extern void zpool_feature_init(void);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZFEATURE_COMMON_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	(revision 296519)
@@ -1,687 +1,688 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/u8_textprep.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_znode.h>
 
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
 
 #if defined(_KERNEL)
 #include <sys/systm.h>
 #else
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #endif
 
 static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
 
 /* Note this is indexed by zfs_userquota_prop_t, keep the order the same */
 const char *zfs_userquota_prop_prefixes[] = {
 	"userused@",
 	"userquota@",
 	"groupused@",
 	"groupquota@"
 };
 
 /*
  * Returns a pointer to the first entry of the dataset property table.
  */
 zprop_desc_t *
 zfs_prop_get_table(void)
 {
 	return (&zfs_prop_table[0]);
 }
 
 /*
  * Registers the ZFS dataset properties.
  *
  * The static tables at the top map the user-visible string values of the
  * index-type properties to their numeric values; each table is terminated
  * by a { NULL } entry.  The zprop_register_*() calls that follow are
  * grouped by property attribute (inherit, default, readonly, set-once) and
  * by value type (index, string, number, hidden).
  */
 void
 zfs_prop_init(void)
 {
 	static zprop_index_t checksum_table[] = {
 		{ "on",		ZIO_CHECKSUM_ON },
 		{ "off",	ZIO_CHECKSUM_OFF },
 		{ "fletcher2",	ZIO_CHECKSUM_FLETCHER_2 },
 		{ "fletcher4",	ZIO_CHECKSUM_FLETCHER_4 },
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
 		{ "noparity",	ZIO_CHECKSUM_NOPARITY },
 #ifdef illumos
 		{ "sha512",	ZIO_CHECKSUM_SHA512 },
 		{ "skein",	ZIO_CHECKSUM_SKEIN },
 		{ "edonr",	ZIO_CHECKSUM_EDONR },
 #endif
 		{ NULL }
 	};
 
 	static zprop_index_t dedup_table[] = {
 		{ "on",		ZIO_CHECKSUM_ON },
 		{ "off",	ZIO_CHECKSUM_OFF },
 		{ "verify",	ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
 		{ "sha256,verify",
 				ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
 #ifdef illumos
 		{ "sha512",	ZIO_CHECKSUM_SHA512 },
 		{ "sha512,verify",
 				ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
 		{ "skein",	ZIO_CHECKSUM_SKEIN },
 		{ "skein,verify",
 				ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
 		{ "edonr,verify",
 				ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
 #endif
 		{ NULL }
 	};
 
 	static zprop_index_t compress_table[] = {
 		{ "on",		ZIO_COMPRESS_ON },
 		{ "off",	ZIO_COMPRESS_OFF },
 		{ "lzjb",	ZIO_COMPRESS_LZJB },
 		{ "gzip",	ZIO_COMPRESS_GZIP_6 },	/* gzip default */
 		{ "gzip-1",	ZIO_COMPRESS_GZIP_1 },
 		{ "gzip-2",	ZIO_COMPRESS_GZIP_2 },
 		{ "gzip-3",	ZIO_COMPRESS_GZIP_3 },
 		{ "gzip-4",	ZIO_COMPRESS_GZIP_4 },
 		{ "gzip-5",	ZIO_COMPRESS_GZIP_5 },
 		{ "gzip-6",	ZIO_COMPRESS_GZIP_6 },
 		{ "gzip-7",	ZIO_COMPRESS_GZIP_7 },
 		{ "gzip-8",	ZIO_COMPRESS_GZIP_8 },
 		{ "gzip-9",	ZIO_COMPRESS_GZIP_9 },
 		{ "zle",	ZIO_COMPRESS_ZLE },
 		{ "lz4",	ZIO_COMPRESS_LZ4 },
 		{ NULL }
 	};
 
 	static zprop_index_t snapdir_table[] = {
 		{ "hidden",	ZFS_SNAPDIR_HIDDEN },
 		{ "visible",	ZFS_SNAPDIR_VISIBLE },
 		{ NULL }
 	};
 
 	static zprop_index_t acl_mode_table[] = {
 		{ "discard",	ZFS_ACL_DISCARD },
 		{ "groupmask",	ZFS_ACL_GROUPMASK },
 		{ "passthrough", ZFS_ACL_PASSTHROUGH },
 		{ "restricted", ZFS_ACL_RESTRICTED },
 		{ NULL }
 	};
 
 	static zprop_index_t acl_inherit_table[] = {
 		{ "discard",	ZFS_ACL_DISCARD },
 		{ "noallow",	ZFS_ACL_NOALLOW },
 		{ "restricted",	ZFS_ACL_RESTRICTED },
 		{ "passthrough", ZFS_ACL_PASSTHROUGH },
 		{ "secure",	ZFS_ACL_RESTRICTED }, /* backward compatibility */
 		{ "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
 		{ NULL }
 	};
 
 	static zprop_index_t case_table[] = {
 		{ "sensitive",		ZFS_CASE_SENSITIVE },
 		{ "insensitive",	ZFS_CASE_INSENSITIVE },
 		{ "mixed",		ZFS_CASE_MIXED },
 		{ NULL }
 	};
 
 	static zprop_index_t copies_table[] = {
 		{ "1",		1 },
 		{ "2",		2 },
 		{ "3",		3 },
 		{ NULL }
 	};
 
 	/*
 	 * Use the unique flags we have to send to u8_strcmp() and/or
 	 * u8_textprep() to represent the various normalization property
 	 * values.
 	 */
 	static zprop_index_t normalize_table[] = {
 		{ "none",	0 },
 		{ "formD",	U8_TEXTPREP_NFD },
 		{ "formKC",	U8_TEXTPREP_NFKC },
 		{ "formC",	U8_TEXTPREP_NFC },
 		{ "formKD",	U8_TEXTPREP_NFKD },
 		{ NULL }
 	};
 
 	static zprop_index_t version_table[] = {
 		{ "1",		1 },
 		{ "2",		2 },
 		{ "3",		3 },
 		{ "4",		4 },
 		{ "5",		5 },
 		{ "current",	ZPL_VERSION },
 		{ NULL }
 	};
 
 	static zprop_index_t boolean_table[] = {
 		{ "off",	0 },
 		{ "on",		1 },
 		{ NULL }
 	};
 
 	static zprop_index_t logbias_table[] = {
 		{ "latency",	ZFS_LOGBIAS_LATENCY },
 		{ "throughput",	ZFS_LOGBIAS_THROUGHPUT },
 		{ NULL }
 	};
 
 	static zprop_index_t canmount_table[] = {
 		{ "off",	ZFS_CANMOUNT_OFF },
 		{ "on",		ZFS_CANMOUNT_ON },
 		{ "noauto",	ZFS_CANMOUNT_NOAUTO },
 		{ NULL }
 	};
 
 	static zprop_index_t cache_table[] = {
 		{ "none",	ZFS_CACHE_NONE },
 		{ "metadata",	ZFS_CACHE_METADATA },
 		{ "all",	ZFS_CACHE_ALL },
 		{ NULL }
 	};
 
 	static zprop_index_t sync_table[] = {
 		{ "standard",	ZFS_SYNC_STANDARD },
 		{ "always",	ZFS_SYNC_ALWAYS },
 		{ "disabled",	ZFS_SYNC_DISABLED },
 		{ NULL }
 	};
 
 	static zprop_index_t volmode_table[] = {
 		{ "default",	ZFS_VOLMODE_DEFAULT },
 		{ "geom",	ZFS_VOLMODE_GEOM },
 		{ "dev",	ZFS_VOLMODE_DEV },
 		{ "none",	ZFS_VOLMODE_NONE },
 		{ NULL }
 	};
 
 	static zprop_index_t redundant_metadata_table[] = {
 		{ "all",	ZFS_REDUNDANT_METADATA_ALL },
 		{ "most",	ZFS_REDUNDANT_METADATA_MOST },
 		{ NULL }
 	};
 
 	/* inherit index properties */
 	zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
 	    ZFS_REDUNDANT_METADATA_ALL,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "all | most", "REDUND_MD",
 	    redundant_metadata_table);
 	zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "standard | always | disabled", "SYNC",
 	    sync_table);
 	/*
 	 * NOTE(review): the value strings for "checksum" and "dedup" below
 	 * list sha512, skein and edonr unconditionally, while the matching
 	 * entries in checksum_table and dedup_table are compiled only under
 	 * "#ifdef illumos" -- confirm this is intended.
 	 */
 	zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
 	    ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME,
 	    "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
 	    "skein | edonr", "CHECKSUM", checksum_table);
 	zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "on | off | verify | sha256[,verify], sha512[,verify], "
 	    "skein[,verify], edonr,verify", "DEDUP", dedup_table);
 	zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
 	    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
 	    "COMPRESS", compress_table);
 	zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "hidden | visible", "SNAPDIR", snapdir_table);
 	zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "discard | groupmask | passthrough | restricted", "ACLMODE",
 	    acl_mode_table);
 	zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
 	    ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "discard | noallow | restricted | passthrough | passthrough-x",
 	    "ACLINHERIT", acl_inherit_table);
 	zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "1 | 2 | 3", "COPIES", copies_table);
 	zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "PRIMARYCACHE", cache_table);
 	zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "SECONDARYCACHE", cache_table);
 	zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "latency | throughput", "LOGBIAS", logbias_table);
 	zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
 	    ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "default | geom | dev | none", "VOLMODE", volmode_table);
 
 	/* inherit index (boolean) properties */
 	zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
 	zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
 	zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
 	    boolean_table);
 	zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
 	    boolean_table);
 
 	/* default index properties */
 	zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
 	zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
 	    "CANMOUNT", canmount_table);
 
 	/* readonly index (boolean) properties */
 	zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
 	zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
 	    boolean_table);
 
 	/* set once index properties */
 	zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
 	    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "none | formC | formD | formKC | formKD", "NORMALIZATION",
 	    normalize_table);
 	zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
 	    ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_SNAPSHOT,
 	    "sensitive | insensitive | mixed", "CASE", case_table);
 
 	/* set once index (boolean) properties */
 	zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "on | off", "UTF8ONLY", boolean_table);
 
 	/* string properties */
 	zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
 	zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
 	    ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
 	zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
 	    "MOUNTPOINT");
 	zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
 	    "SHARENFS");
 	zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
 	    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
 	    "filesystem | volume | snapshot | bookmark", "TYPE");
 	zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "on | off | sharemgr(1M) options", "SHARESMB");
 	zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
 	    ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
 	    "<sensitivity label>", "MLSLABEL");
 	zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
 	    "receive_resume_token",
 	    NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<string token>", "RESUMETOK");
 
 	/* readonly number properties */
 	zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "<size>", "USED");
 	zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
 	zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
 	zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<1.00x or higher if compressed>", "RATIO");
 	zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<1.00x or higher if compressed>", "REFRATIO");
 	zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
 	    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
 	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK");
 	zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
 	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
 	    "USEDSNAP");
 	zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
 	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
 	    "USEDDS");
 	zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
 	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
 	    "USEDCHILD");
 	zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
 	    PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
 	zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
 	    ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
 	zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "<size>", "WRITTEN");
 	zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LUSED");
 	zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
 	    0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
 
 	/* default number properties */
 	zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
 	zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<size> | none", "RESERV");
 	zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
 	    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
 	zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
 	zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<size> | none", "REFRESERV");
 	zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
 	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
 	    "<count> | none", "FSLIMIT");
 	zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
 	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<count> | none", "SSLIMIT");
 	zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
 	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
 	    "<count>", "FSCOUNT");
 	zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
 	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<count>", "SSCOUNT");
 
 	/* inherit number properties */
 	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
 	    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
 	/* hidden properties */
 	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
 	zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
 	zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
 	zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
 	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
 	zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
 	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
 	    "STMF_SBD_LU");
 	zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID");
 	zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
 	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
 	    "USERACCOUNTING");
 	zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
 	zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
 	zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
 	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
 	zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
 
 	/* oddball properties */
 	zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
 	    NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
 	    "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
 }
 
 /*
  * Returns B_TRUE if the property may be delegated: every property whose
  * attribute is not PROP_READONLY, except mlslabel.
  */
 boolean_t
 zfs_prop_delegatable(zfs_prop_t prop)
 {
 	/* The mlslabel property is never delegatable. */
 	if (prop == ZFS_PROP_MLSLABEL)
 		return (B_FALSE);
 
 	return (zfs_prop_table[prop].pd_attr != PROP_READONLY);
 }
 
 /*
  * Maps a zfs dataset property name to its property ID by looking it up
  * among the ZFS_TYPE_DATASET properties.
  */
 zfs_prop_t
 zfs_name_to_prop(const char *propname)
 {
 	zfs_prop_t prop;
 
 	prop = zprop_name_to_prop(propname, ZFS_TYPE_DATASET);
 
 	return (prop);
 }
 
 /*
  * Characters permitted in user property names: lowercase letters, digits,
  * and the punctuation characters '-', '_', '.' and ':'.  Returns 1 if c is
  * permitted, 0 otherwise.
  */
 static int
 valid_char(char c)
 {
 	if (c >= 'a' && c <= 'z')
 		return (1);
 	if (c >= '0' && c <= '9')
 		return (1);
 
 	switch (c) {
 	case '-':
 	case '_':
 	case '.':
 	case ':':
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Returns true if this is a valid user-defined property name: every
  * character must be accepted by valid_char() and the name must contain at
  * least one ':'.  The empty string is therefore not a user property.
  */
 boolean_t
 zfs_prop_user(const char *name)
 {
 	boolean_t foundsep = B_FALSE;
 	const char *p;
 
 	/*
 	 * Walk the name once, up to the terminating NUL, rather than
 	 * calling strlen() on every iteration.
 	 */
 	for (p = name; *p != '\0'; p++) {
 		if (!valid_char(*p))
 			return (B_FALSE);
 		if (*p == ':')
 			foundsep = B_TRUE;
 	}
 
 	return (foundsep);
 }
 
 /*
  * Returns true if this is a userspace-type property, i.e. if the name
  * begins with one of the prefixes in zfs_userquota_prop_prefixes (each of
  * which ends in '@').  Whatever follows the prefix is not examined, so any
  * character is valid there (eg, another @, for SID user@domain).
  */
 boolean_t
 zfs_prop_userquota(const char *name)
 {
 	zfs_userquota_prop_t idx;
 
 	for (idx = 0; idx < ZFS_NUM_USERQUOTA_PROPS; idx++) {
 		const char *prefix = zfs_userquota_prop_prefixes[idx];
 
 		if (strncmp(name, prefix, strlen(prefix)) == 0)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Returns true if this is a written@ property, i.e. if the name begins
  * with "written@".  Whatever follows the prefix is not examined, so any
  * character is valid there (eg, another @, for written@pool/fs@origin).
  */
 boolean_t
 zfs_prop_written(const char *name)
 {
 	static const char written_prefix[] = "written@";
 
 	/* sizeof counts the terminating NUL, so subtract it. */
 	return (strncmp(name, written_prefix,
 	    sizeof (written_prefix) - 1) == 0);
 }
 
 /*
  * Tables of index types, plus functions to convert between the user view
  * (strings) and internal representation (uint64_t).
  *
  * This one converts a dataset property's string value to its index; the
  * return value is whatever zprop_string_to_index() returns.
  */
 int
 zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
 {
 	int err;
 
 	err = zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET);
 
 	return (err);
 }
 
 /*
  * Converts a dataset property's index value to its string form; the return
  * value is whatever zprop_index_to_string() returns.
  */
 int
 zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
 {
 	int err;
 
 	err = zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET);
 
 	return (err);
 }
 
 /*
  * Returns the result of zprop_random_value() for the given dataset
  * property and seed.
  */
 uint64_t
 zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
 {
 	uint64_t value;
 
 	value = zprop_random_value(prop, seed, ZFS_TYPE_DATASET);
 
 	return (value);
 }
 
 /*
  * Returns TRUE if the property applies to any of the given dataset types.
  * The check itself is performed by zprop_valid_for_type().
  */
 boolean_t
 zfs_prop_valid_for_type(int prop, zfs_type_t types)
 {
 	boolean_t applies;
 
 	applies = zprop_valid_for_type(prop, types);
 
 	return (applies);
 }
 
 /*
  * Returns the value type recorded for the property in the dataset property
  * table.
  */
 zprop_type_t
 zfs_prop_get_type(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_proptype);
 }
 
 /*
  * Returns TRUE if the property is readonly.  Set-once (PROP_ONETIME)
  * properties are reported as readonly too.
  */
 boolean_t
 zfs_prop_readonly(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	if (pd->pd_attr == PROP_READONLY)
 		return (B_TRUE);
 
 	return (pd->pd_attr == PROP_ONETIME);
 }
 
 /*
  * Returns TRUE if the property is only allowed to be set once, i.e. its
  * attribute is PROP_ONETIME.
  */
 boolean_t
 zfs_prop_setonce(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_attr == PROP_ONETIME);
 }
 
 /*
  * Returns the default string value recorded for the property in the
  * dataset property table.
  */
 const char *
 zfs_prop_default_string(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_strdefault);
 }
 
 /*
  * Returns the default numeric value recorded for the property in the
  * dataset property table.
  */
 uint64_t
 zfs_prop_default_numeric(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_numdefault);
 }
 
 /*
  * Given a dataset property ID, returns the corresponding name.
  * The ID is assumed to be valid; it is used to index the property table
  * without a range check.
  */
 const char *
 zfs_prop_to_name(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_name);
 }
 
 /*
  * Returns TRUE if the property is inheritable.  Set-once (PROP_ONETIME)
  * properties are reported as inheritable too.
  */
 boolean_t
 zfs_prop_inheritable(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	if (pd->pd_attr == PROP_INHERIT)
 		return (B_TRUE);
 
 	return (pd->pd_attr == PROP_ONETIME);
 }
 
 #ifndef _KERNEL
 
 /*
  * Returns a string describing the set of acceptable values for the given
  * zfs property, or NULL if it cannot be set.
  */
 const char *
 zfs_prop_values(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_values);
 }
 
 /*
  * Returns TRUE if this property is a string type.  Index types
  * (compression, checksum) count as strings here: userland treats them as
  * strings even though they are stored numerically on disk.
  */
 int
 zfs_prop_is_string(zfs_prop_t prop)
 {
 	zprop_type_t type = zfs_prop_table[prop].pd_proptype;
 
 	return (type == PROP_TYPE_STRING || type == PROP_TYPE_INDEX);
 }
 
 /*
  * Returns the column header for the given property.  Used only in
  * 'zfs list -o', but centralized here with the other property information.
  */
 const char *
 zfs_prop_column_name(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_colname);
 }
 
 /*
  * Returns whether the given property should be displayed right-justified
  * for 'zfs list'.
  */
 boolean_t
 zfs_prop_align_right(zfs_prop_t prop)
 {
 	const zprop_desc_t *pd = &zfs_prop_table[prop];
 
 	return (pd->pd_rightalign);
 }
 
 #endif
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	(revision 296519)
@@ -1,236 +1,237 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 
 #include "zfs_prop.h"
 
 #if defined(_KERNEL)
 #include <sys/systm.h>
 #else
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #endif
 
 static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
 
 /*
  * Returns a pointer to the first entry of the file-private pool property
  * table so that code outside this file can reach it.
  */
 zprop_desc_t *
 zpool_prop_get_table(void)
 {
 	return (&zpool_prop_table[0]);
 }
 
 /*
  * Registers every pool property (name, default, attribute, value
  * description and column header) through the zprop_register_*() helpers.
  * All properties are registered with type ZFS_TYPE_POOL.
  */
 void
 zpool_prop_init(void)
 {
 	/*
 	 * Name/value tables for the index properties.  They are static, so
 	 * they remain valid after this function returns; each is handed to
 	 * zprop_register_index() below and ends with a NULL-name entry.
 	 */
 	static zprop_index_t boolean_table[] = {
 		{ "off",	0},
 		{ "on",		1},
 		{ NULL }
 	};
 
 	static zprop_index_t failuremode_table[] = {
 		{ "wait",	ZIO_FAILURE_MODE_WAIT },
 		{ "continue",	ZIO_FAILURE_MODE_CONTINUE },
 		{ "panic",	ZIO_FAILURE_MODE_PANIC },
 		{ NULL }
 	};
 
 	/* string properties */
 	zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<path>", "ALTROOT");
 	zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
 	zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
 	zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
 
 	/* readonly number properties */
 	zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "SIZE");
 	zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "FREE");
 	zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "FREEING");
 	zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "LEAKED");
 	zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
 	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
 	zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
 	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
 	zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
 	    PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
 	zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "CAP");
 	zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<guid>", "GUID");
 	zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<state>", "HEALTH");
 	zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
 	    PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
 	    "DEDUP");
 
 	/* default number properties */
 	zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
 	zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
 
 	/* default index (boolean) properties */
 	zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
 	    boolean_table);
 	zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
 	zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
 	    boolean_table);
 	zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
 	zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
 
 	/* default index properties */
 	zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
 	    ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "wait | continue | panic", "FAILMODE", failuremode_table);
 
 	/* hidden properties */
 	zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
 	zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
 	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
 }
 
 /*
  * Given a pool property name, returns the corresponding property ID.
  * The lookup is delegated to zprop_name_to_prop() with the type fixed to
  * ZFS_TYPE_POOL.
  */
 zpool_prop_t
 zpool_name_to_prop(const char *propname)
 {
 	zpool_prop_t prop;
 
 	prop = zprop_name_to_prop(propname, ZFS_TYPE_POOL);
 	return (prop);
 }
 
 /*
  * Given a pool property ID, returns the corresponding name.
  * The pool property ID is assumed to be valid; it indexes
  * zpool_prop_table without a range check.
  */
 const char *
 zpool_prop_to_name(zpool_prop_t prop)
 {
 	const char *name = zpool_prop_table[prop].pd_name;
 
 	return (name);
 }
 
 /*
  * Returns the property type (pd_proptype) recorded for the pool property.
  */
 zprop_type_t
 zpool_prop_get_type(zpool_prop_t prop)
 {
 	zprop_type_t proptype = zpool_prop_table[prop].pd_proptype;
 
 	return (proptype);
 }
 
 /*
  * Returns TRUE if the pool property's attribute is PROP_READONLY.
  */
 boolean_t
 zpool_prop_readonly(zpool_prop_t prop)
 {
 	boolean_t readonly;
 
 	readonly = (zpool_prop_table[prop].pd_attr == PROP_READONLY);
 	return (readonly);
 }
 
 /*
  * Returns the string default (pd_strdefault) recorded for the pool
  * property.
  */
 const char *
 zpool_prop_default_string(zpool_prop_t prop)
 {
 	const char *strdefault = zpool_prop_table[prop].pd_strdefault;
 
 	return (strdefault);
 }
 
 /*
  * Returns the numeric default (pd_numdefault) recorded for the pool
  * property.
  */
 uint64_t
 zpool_prop_default_numeric(zpool_prop_t prop)
 {
 	uint64_t numdefault = zpool_prop_table[prop].pd_numdefault;
 
 	return (numdefault);
 }
 
 /*
  * Returns true if the name has the form of a feature@ property, i.e. it
  * begins with the "feature@" prefix.  Only the prefix is examined; the
  * remainder of the name is not validated here.
  */
 boolean_t
 zpool_prop_feature(const char *name)
 {
 	static const char *prefix = "feature@";
 	int differs;
 
 	differs = strncmp(name, prefix, strlen(prefix));
 	return (differs == 0);
 }
 
 /*
  * Returns true if the name has the form of an unsupported@ property, i.e.
  * it begins with the "unsupported@" prefix.  Only the prefix is examined;
  * the remainder of the name is not validated here.
  */
 boolean_t
 zpool_prop_unsupported(const char *name)
 {
 	static const char *prefix = "unsupported@";
 	int differs;
 
 	differs = strncmp(name, prefix, strlen(prefix));
 	return (differs == 0);
 }
 
 /*
  * Pool-specific wrapper around zprop_string_to_index(): forwards the
  * property, the string and the output pointer with the type fixed to
  * ZFS_TYPE_POOL, and returns the helper's result unchanged.
  */
 int
 zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
     uint64_t *index)
 {
 	int rc;
 
 	rc = zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL);
 	return (rc);
 }
 
 /*
  * Pool-specific wrapper around zprop_index_to_string(): forwards the
  * property, the index and the output pointer with the type fixed to
  * ZFS_TYPE_POOL, and returns the helper's result unchanged.
  */
 int
 zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
     const char **string)
 {
 	int rc;
 
 	rc = zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL);
 	return (rc);
 }
 
 /*
  * Pool-specific wrapper around zprop_random_value(): forwards the
  * property and seed with the type fixed to ZFS_TYPE_POOL.
  */
 uint64_t
 zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
 {
 	uint64_t value;
 
 	value = zprop_random_value(prop, seed, ZFS_TYPE_POOL);
 	return (value);
 }
 
 #ifndef _KERNEL
 
 /*
  * Returns the pd_values string recorded for the pool property (the
  * description of its acceptable values given at registration).
  */
 const char *
 zpool_prop_values(zpool_prop_t prop)
 {
 	const char *values = zpool_prop_table[prop].pd_values;
 
 	return (values);
 }
 
 /*
  * Returns the column header (pd_colname) recorded for the pool property.
  */
 const char *
 zpool_prop_column_name(zpool_prop_t prop)
 {
 	const char *colname = zpool_prop_table[prop].pd_colname;
 
 	return (colname);
 }
 
 /*
  * Returns the pd_rightalign flag recorded for the pool property.
  */
 boolean_t
 zpool_prop_align_right(zpool_prop_t prop)
 {
 	boolean_t rightalign = zpool_prop_table[prop].pd_rightalign;
 
 	return (rightalign);
 }
 #endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	(revision 296519)
@@ -1,591 +1,592 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/dsl_pool.h>
 #include <sys/zfeature.h>
 #include <sys/zap.h>
 
 /*
  * Return the object number of an empty bpobj.
  *
  * When SPA_FEATURE_EMPTY_BPOBJ is enabled, the pool-wide dummy object
  * (dp_empty_bpobj) is returned and the feature's count is incremented.
  * The dummy is created on the first use, while the feature is not yet
  * active, and recorded in the pool directory under DMU_POOL_EMPTY_BPOBJ;
  * "blocksize" is not used on this path.  Without the feature, a private
  * bpobj is allocated with the requested block size.
  */
 uint64_t
 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	/* Feature not enabled: every caller gets its own object. */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ))
 		return (bpobj_alloc(os, blocksize, tx));
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		/* First user: create the shared object and record it. */
 		ASSERT0(dp->dp_empty_bpobj);
 		dp->dp_empty_bpobj =
 		    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(os,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj, tx) == 0);
 	}
 
 	spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
 	ASSERT(dp->dp_empty_bpobj != 0);
 	return (dp->dp_empty_bpobj);
 }
 
 /*
  * Drop one count on SPA_FEATURE_EMPTY_BPOBJ.  If the feature is no
  * longer active afterwards, remove the DMU_POOL_EMPTY_BPOBJ entry from
  * the pool directory, free the shared empty bpobj and clear
  * dp_empty_bpobj.
  */
 void
 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
 
 	/* Feature still active: the shared object stays in place. */
 	if (spa_feature_is_active(dmu_objset_spa(os),
 	    SPA_FEATURE_EMPTY_BPOBJ))
 		return;
 
 	VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_EMPTY_BPOBJ, tx));
 	VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
 	dp->dp_empty_bpobj = 0;
 }
 
 /*
  * Allocate a new bpobj object in "os" with the given data block size and
  * return its object number.  The bonus buffer size follows the pool's
  * SPA version: BPOBJ_SIZE_V0 before SPA_VERSION_BPOBJ_ACCOUNT,
  * BPOBJ_SIZE_V1 before SPA_VERSION_DEADLISTS, and the full bpobj_phys_t
  * otherwise.
  */
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
 	int size = sizeof (bpobj_phys_t);
 
 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) {
 		size = BPOBJ_SIZE_V0;
 	} else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) {
 		size = BPOBJ_SIZE_V1;
 	}
 
 	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
 	    DMU_OT_BPOBJ_HDR, size, tx));
 }
 
 /*
  * Free the bpobj "obj" together with everything hanging off it: each
  * sub-bpobj listed in its sub-object array is freed recursively, then the
  * array object itself, and finally "obj".  Must not be called on the
  * pool's shared empty bpobj.  Any failure trips a VERIFY.
  */
 void
 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 {
 	int64_t i;
 	bpobj_t bpo;
 	dmu_object_info_t doi;
 	int epb;
 	dmu_buf_t *dbuf = NULL;
 
 	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
 	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
 	mutex_enter(&bpo.bpo_lock);
 
 	/* No sub-object list (old format or never created): just free obj. */
 	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
 		goto out;
 
 	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
 	/* Number of uint64_t object numbers per data block of the array. */
 	epb = doi.doi_data_block_size / sizeof (uint64_t);
 
 	/*
 	 * Walk the array from the last entry to the first.  The held dbuf is
 	 * reused until the walk reaches an entry located before it.
 	 */
 	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
 		uint64_t *objarray;
 		uint64_t offset, blkoff;
 
 		offset = i * sizeof (uint64_t);
 		blkoff = P2PHASE(i, epb);
 
 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			VERIFY3U(0, ==, dmu_buf_hold(os,
 			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
 		objarray = dbuf->db_data;
 		bpobj_free(os, objarray[blkoff], tx);
 	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
 	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
 
 out:
 	mutex_exit(&bpo.bpo_lock);
 	bpobj_close(&bpo);
 
 	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
 }
 
 /*
  * Open the bpobj stored in "object" of "os" and initialize *bpo for it.
  *
  * Returns 0 on success; *bpo then holds the object's bonus buffer (tagged
  * with bpo) and must be released with bpobj_close().  On failure the
  * error from dmu_object_info() or dmu_bonus_hold() is returned:
  *  - if dmu_object_info() fails, *bpo is left untouched;
  *  - if dmu_bonus_hold() fails, *bpo has been zeroed and its lock has
  *    been destroyed again, so nothing is left to clean up.
  */
 int
 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
 {
 	dmu_object_info_t doi;
 	int err;
 
 	err = dmu_object_info(os, object, &doi);
 	if (err)
 		return (err);
 
 	bzero(bpo, sizeof (*bpo));
 	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	ASSERT(bpo->bpo_dbuf == NULL);
 	ASSERT(bpo->bpo_phys == NULL);
 	ASSERT(object != 0);
 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
 
 	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
 	if (err) {
 		/*
 		 * bpo_object is still 0 here, and bpobj_close() returns
 		 * early for such a bpobj, so the lock initialized above
 		 * would never be destroyed unless we do it now.
 		 */
 		mutex_destroy(&bpo->bpo_lock);
 		return (err);
 	}
 
 	bpo->bpo_os = os;
 	bpo->bpo_object = object;
 	/* Block pointers per data block. */
 	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
 	/* A larger bonus buffer implies the newer on-disk fields exist. */
 	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
 	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
 	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
 	return (0);
 }
 
 /*
  * Release the buffers held by an open bpobj, clear its fields and destroy
  * its lock.  A bpobj whose bpo_object is 0 is treated as never opened and
  * is left alone.
  */
 void
 bpobj_close(bpobj_t *bpo)
 {
 	/* Lame workaround for closing a bpobj that was never opened. */
 	if (bpo->bpo_object != 0) {
 		dmu_buf_rele(bpo->bpo_dbuf, bpo);
 		if (bpo->bpo_cached_dbuf != NULL)
 			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 
 		bpo->bpo_object = 0;
 		bpo->bpo_cached_dbuf = NULL;
 		bpo->bpo_phys = NULL;
 		bpo->bpo_dbuf = NULL;
 
 		mutex_destroy(&bpo->bpo_lock);
 	}
 }
 
 /*
  * Reports whether the bpobj still references anything: block pointers
  * stored directly in it or, when the sub-object fields are present
  * (bpo_havesubobj), entries in its sub-object list.
  */
 static boolean_t
 bpobj_hasentries(bpobj_t *bpo)
 {
 	boolean_t has_subobjs;
 
 	/* bpo_num_subobjs is only read when bpo_havesubobj is set. */
 	has_subobjs = (bpo->bpo_havesubobj &&
 	    bpo->bpo_phys->bpo_num_subobjs != 0);
 
 	return (bpo->bpo_phys->bpo_num_blkptrs != 0 || has_subobjs);
 }
 
 /*
  * Common worker for bpobj_iterate() and bpobj_iterate_nofree().
  *
  * Calls func(arg, bp, tx) for every block pointer stored directly in the
  * bpobj, last entry first, and then recurses into every sub-bpobj, again
  * last entry first.  Iteration stops at the first nonzero error (from
  * func or from the DMU) and that error is returned.
  *
  * If "free" is set, every entry for which func returned 0 is removed: the
  * space accounting in bpo_phys is reduced, the entry counts are
  * decremented, the tail of the backing objects is freed and fully
  * processed sub-bpobjs are freed.
  *
  * bpo_lock is held for the whole call.
  */
 static int
 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
     boolean_t free)
 {
 	dmu_object_info_t doi;
 	int epb;
 	int64_t i;
 	int err = 0;
 	dmu_buf_t *dbuf = NULL;
 
 	mutex_enter(&bpo->bpo_lock);
 
 	/* The accounting in the bonus buffer is about to be modified. */
 	if (free)
 		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 
 	/* Pass 1: block pointers stored directly in this bpobj. */
 	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
 		blkptr_t *bparray;
 		blkptr_t *bp;
 		uint64_t offset, blkoff;
 
 		offset = i * sizeof (blkptr_t);
 		blkoff = P2PHASE(i, bpo->bpo_epb);
 
 		/* Re-hold only when this entry lies before the held buffer. */
 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
 			    FTAG, &dbuf, 0);
 			if (err)
 				break;
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
 		bparray = dbuf->db_data;
 		bp = &bparray[blkoff];
 		err = func(arg, bp, tx);
 		if (err)
 			break;
 		if (free) {
 			bpo->bpo_phys->bpo_bytes -=
 			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
 			if (bpo->bpo_havecomp) {
 				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
 				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
 			}
 			bpo->bpo_phys->bpo_num_blkptrs--;
 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
 		}
 	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
 	/*
 	 * "i" is now the index of the last entry that was NOT removed (-1 if
 	 * all were), so everything from entry i + 1 onwards can be freed.
 	 */
 	if (free) {
 		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
 		    (i + 1) * sizeof (blkptr_t), -1ULL, tx));
 	}
 	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
 		goto out;
 
 	ASSERT(bpo->bpo_havecomp);
 	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
 	if (err) {
 		/* Returns directly, bypassing the checks at "out". */
 		mutex_exit(&bpo->bpo_lock);
 		return (err);
 	}
 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
 	/* Number of uint64_t object numbers per data block of the array. */
 	epb = doi.doi_data_block_size / sizeof (uint64_t);
 
 	/* Pass 2: recurse into each sub-bpobj. */
 	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
 		uint64_t *objarray;
 		uint64_t offset, blkoff;
 		bpobj_t sublist;
 		uint64_t used_before, comp_before, uncomp_before;
 		uint64_t used_after, comp_after, uncomp_after;
 
 		offset = i * sizeof (uint64_t);
 		blkoff = P2PHASE(i, epb);
 
 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			err = dmu_buf_hold(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
 			if (err)
 				break;
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
 		objarray = dbuf->db_data;
 		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
 		if (err)
 			break;
 		if (free) {
 			err = bpobj_space(&sublist,
 			    &used_before, &comp_before, &uncomp_before);
 			if (err != 0) {
 				bpobj_close(&sublist);
 				break;
 			}
 		}
 		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
 		if (free) {
 			/*
 			 * Charge this bpobj for whatever the recursion removed
 			 * from the sub-bpobj, even if it stopped early.
 			 */
 			VERIFY3U(0, ==, bpobj_space(&sublist,
 			    &used_after, &comp_after, &uncomp_after));
 			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
 			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
 			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
 			bpo->bpo_phys->bpo_uncomp -=
 			    uncomp_before - uncomp_after;
 		}
 
 		bpobj_close(&sublist);
 		if (err)
 			break;
 		if (free) {
 			err = dmu_object_free(bpo->bpo_os,
 			    objarray[blkoff], tx);
 			if (err)
 				break;
 			bpo->bpo_phys->bpo_num_subobjs--;
 			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
 		}
 	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
 	/* Same tail trim as above, for the sub-object array. */
 	if (free) {
 		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
 		    bpo->bpo_phys->bpo_subobjs,
 		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
 	}
 
 out:
 	/* If there are no entries, there should be no bytes. */
 	if (!bpobj_hasentries(bpo)) {
 		ASSERT0(bpo->bpo_phys->bpo_bytes);
 		ASSERT0(bpo->bpo_phys->bpo_comp);
 		ASSERT0(bpo->bpo_phys->bpo_uncomp);
 	}
 
 	mutex_exit(&bpo->bpo_lock);
 	return (err);
 }
 
 /*
  * Iterate over the entries and remove each one that was visited
  * successfully.  If func returns nonzero, iteration stops, that entry is
  * not removed, and the value is returned to the caller.
  */
 int
 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 {
 	int err;
 
 	err = bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE);
 	return (err);
 }
 
 /*
  * Iterate over the entries without removing anything.  If func returns
  * nonzero, iteration stops and the value is returned to the caller.
  */
 int
 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 {
 	int err;
 
 	err = bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE);
 	return (err);
 }
 
 /*
  * Append the bpobj "subobj" to bpo's list of sub-objects and add its
  * space totals to bpo's accounting.
  *
  * Special cases:
  *  - subobj is the pool's shared empty bpobj: only the empty-bpobj
  *    count is dropped (bpobj_decr_empty()); nothing is enqueued.
  *  - subobj has no entries: it is freed instead of being enqueued.
  *  - subobj's own sub-object array occupies a single block: its entries
  *    are additionally copied onto bpo's list and the array is freed.
  *
  * bpo must have the sub-object and compression fields, and must not be
  * the shared empty bpobj.
  */
 void
 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 {
 	bpobj_t subbpo;
 	uint64_t used, comp, uncomp, subsubobjs;
 
 	ASSERT(bpo->bpo_havesubobj);
 	ASSERT(bpo->bpo_havecomp);
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
 		bpobj_decr_empty(bpo->bpo_os, tx);
 		return;
 	}
 
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
 	if (!bpobj_hasentries(&subbpo)) {
 		/* No point in having an empty subobj. */
 		bpobj_close(&subbpo);
 		bpobj_free(bpo->bpo_os, subobj, tx);
 		return;
 	}
 
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	/* Create the sub-object array on first use. */
 	if (bpo->bpo_phys->bpo_subobjs == 0) {
 		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
 		    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
 		    DMU_OT_NONE, 0, tx);
 	}
 
 	/*
 	 * This doi is referenced only inside the two assertion macros
 	 * below; the block further down declares its own doi.
 	 */
 	dmu_object_info_t doi;
 	ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
 
 	mutex_enter(&bpo->bpo_lock);
 	/* Append subobj's object number at the end of the array. */
 	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 	    sizeof (subobj), &subobj, tx);
 	bpo->bpo_phys->bpo_num_subobjs++;
 
 	/*
 	 * If subobj has only one block of subobjs, then move subobj's
 	 * subobjs to bpo's subobj list directly.  This reduces
 	 * recursion in bpobj_iterate due to nested subobjs.
 	 */
 	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
 	if (subsubobjs != 0) {
 		dmu_object_info_t doi;
 
 		VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
 		if (doi.doi_max_offset == doi.doi_data_block_size) {
 			dmu_buf_t *subdb;
 			uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
 
 			VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
 			    0, FTAG, &subdb, 0));
 			/*
 			 * Make sure that we are not asking dmu_write()
 			 * to write more data than we have in our buffer.
 			 */
 			VERIFY3U(subdb->db_size, >=,
 			    numsubsub * sizeof (subobj));
 			dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 			    numsubsub * sizeof (subobj), subdb->db_data, tx);
 			dmu_buf_rele(subdb, FTAG);
 			bpo->bpo_phys->bpo_num_subobjs += numsubsub;
 
 			/* subobj no longer owns a sub-object array. */
 			dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
 			subbpo.bpo_phys->bpo_subobjs = 0;
 			VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
 			    subsubobjs, tx));
 		}
 	}
 	/* Totals sampled from subobj right after it was opened. */
 	bpo->bpo_phys->bpo_bytes += used;
 	bpo->bpo_phys->bpo_comp += comp;
 	bpo->bpo_phys->bpo_uncomp += uncomp;
 	mutex_exit(&bpo->bpo_lock);
 
 	bpobj_close(&subbpo);
 }
 
 /*
  * Append a copy of the block pointer "bp" to the bpobj and add its sizes
  * to the bpobj's space accounting.
  *
  * The stored copy is trimmed: an embedded bp keeps only blk_prop and
  * blk_birth, a non-dedup bp has its checksum zeroed, and the fill count
  * is always cleared.  The accounting uses the original bp.
  *
  * bp must not be a hole and bpo must not be the shared empty bpobj.
  */
 void
 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	blkptr_t stored_bp = *bp;
 	uint64_t offset;
 	int blkoff;
 	blkptr_t *bparray;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/*
 		 * The bpobj will compress better without the payload.
 		 *
 		 * Note that we store EMBEDDED bp's because they have an
 		 * uncompressed size, which must be accounted for.  An
 		 * alternative would be to add their size to bpo_uncomp
 		 * without storing the bp, but that would create additional
 		 * complications: bpo_uncomp would be inconsistent with the
 		 * set of BP's stored, and bpobj_iterate() wouldn't visit
 		 * all the space accounted for in the bpobj.
 		 */
 		bzero(&stored_bp, sizeof (stored_bp));
 		stored_bp.blk_prop = bp->blk_prop;
 		stored_bp.blk_birth = bp->blk_birth;
 	} else if (!BP_GET_DEDUP(bp)) {
 		/* The bpobj will compress better without the checksum */
 		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
 	}
 
 	/* We never need the fill count. */
 	stored_bp.blk_fill = 0;
 
 	mutex_enter(&bpo->bpo_lock);
 
 	/* Byte offset of the new entry and its index within its block. */
 	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
 	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
 
 	/*
 	 * Reuse the cached data buffer when the new entry falls inside it;
 	 * otherwise swap it for the buffer covering "offset".
 	 */
 	if (bpo->bpo_cached_dbuf == NULL ||
 	    offset < bpo->bpo_cached_dbuf->db_offset ||
 	    offset >= bpo->bpo_cached_dbuf->db_offset +
 	    bpo->bpo_cached_dbuf->db_size) {
 		if (bpo->bpo_cached_dbuf)
 			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
 		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
 	}
 
 	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
 	bparray = bpo->bpo_cached_dbuf->db_data;
 	bparray[blkoff] = stored_bp;
 
 	/* Update the header (bonus buffer) counters. */
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	bpo->bpo_phys->bpo_num_blkptrs++;
 	bpo->bpo_phys->bpo_bytes +=
 	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 	if (bpo->bpo_havecomp) {
 		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
 		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
 	}
 	mutex_exit(&bpo->bpo_lock);
 }
 
 /*
  * Argument block for space_range_cb(): the txg window to match and the
  * running totals for the block pointers that fall inside it.
  */
 struct space_range_arg {
 	spa_t *spa;		/* pool used for the dsize computation */
 	uint64_t mintxg;	/* exclusive lower bound on blk_birth */
 	uint64_t maxtxg;	/* inclusive upper bound on blk_birth */
 	uint64_t used;		/* accumulated bp_get_dsize[_sync]() */
 	uint64_t comp;		/* accumulated BP_GET_PSIZE() */
 	uint64_t uncomp;	/* accumulated BP_GET_UCSIZE() */
 };
 
 /*
  * bpobj iteration callback: adds the sizes of "bp" to the totals in the
  * struct space_range_arg passed as "arg" when
  * mintxg < blk_birth <= maxtxg.  Always returns 0, so iteration is never
  * stopped by this callback.
  */
 /* ARGSUSED */
 static int
 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	struct space_range_arg *sra = arg;
 
 	/* Born outside the requested window: not counted. */
 	if (bp->blk_birth <= sra->mintxg || bp->blk_birth > sra->maxtxg)
 		return (0);
 
 	/* Use the sync-context size routine only in syncing context. */
 	if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 		sra->used += bp_get_dsize_sync(sra->spa, bp);
 	else
 		sra->used += bp_get_dsize(sra->spa, bp);
 
 	sra->comp += BP_GET_PSIZE(bp);
 	sra->uncomp += BP_GET_UCSIZE(bp);
 	return (0);
 }
 
 /*
  * Report the total space referenced by the bpobj.
  *
  * When the bpobj carries compression accounting (bpo_havecomp), all three
  * values are read from the header under bpo_lock and 0 is returned.
  * Otherwise the lock is dropped and the values are computed by
  * bpobj_space_range() over the whole txg range, whose result is returned.
  */
 int
 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	mutex_enter(&bpo->bpo_lock);
 	*usedp = bpo->bpo_phys->bpo_bytes;
 
 	if (!bpo->bpo_havecomp) {
 		/* Old format: the sizes must be gathered by iteration. */
 		mutex_exit(&bpo->bpo_lock);
 		return (bpobj_space_range(bpo, 0, UINT64_MAX,
 		    usedp, compp, uncompp));
 	}
 
 	*compp = bpo->bpo_phys->bpo_comp;
 	*uncompp = bpo->bpo_phys->bpo_uncomp;
 	mutex_exit(&bpo->bpo_lock);
 	return (0);
 }
 
 /*
  * Return the amount of space in the bpobj which is:
  * mintxg < blk_birth <= maxtxg
  *
  * The totals are stored through usedp, compp and uncompp.  The return
  * value is 0 or the error reported by the iteration.
  */
 int
 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	struct space_range_arg sra = { 0 };
 	int whole_range;
 	int err;
 
 	/*
 	 * As an optimization, if they want the whole txg range, just
 	 * get bpo_bytes rather than iterating over the bps.
 	 */
 	whole_range = (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX);
 	if (whole_range && bpo->bpo_havecomp)
 		return (bpobj_space(bpo, usedp, compp, uncompp));
 
 	sra.maxtxg = maxtxg;
 	sra.mintxg = mintxg;
 	sra.spa = dmu_objset_spa(bpo->bpo_os);
 
 	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
 
 	/* The totals are handed back even if the iteration failed. */
 	*uncompp = sra.uncomp;
 	*compp = sra.comp;
 	*usedp = sra.used;
 	return (err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	(revision 296519)
@@ -1,299 +1,300 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/arc.h>
 #include <sys/bptree.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dnode.h>
 #include <sys/refcount.h>
 #include <sys/spa.h>
 
 /*
  * A bptree is a queue of root block pointers from destroyed datasets. When a
  * dataset is destroyed its root block pointer is put on the end of the pool's
  * bptree queue so the dataset's blocks can be freed asynchronously by
  * dsl_scan_sync. This allows the delete operation to finish without traversing
  * all the dataset's blocks.
  *
  * Note that while bt_begin and bt_end are only ever incremented in this code,
  * they are effectively reset to 0 every time the entire bptree is freed because
  * the bptree's object is destroyed and re-created.
  */
 
 /*
  * State handed from bptree_iterate() to its per-block callback,
  * bptree_visit_cb(), for the duration of one traversal.
  *
  * The "typedef" keyword is required here: without it the trailing
  * "bptree_args_t" would define a global variable of this structure type
  * rather than a type name.
  */
 typedef struct bptree_args {
 	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
 	boolean_t ba_free;	/* true if freeing during traversal */
 
 	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
 	void *ba_arg;		/* caller supplied argument to ba_func */
 	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
 } bptree_args_t;
 
 /*
  * Allocate a new, empty bptree object in "os" and return its object
  * number.  The bptree_phys_t header lives in the object's bonus buffer.
  */
 uint64_t
 bptree_alloc(objset_t *os, dmu_tx_t *tx)
 {
 	bptree_phys_t *bt;
 	dmu_buf_t *db;
 	uint64_t obj;
 
 	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
 	    sizeof (bptree_phys_t), tx);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 
 	/*
 	 * Bonus buffer contents are already initialized to 0, but for
 	 * readability we make it explicit.
 	 */
 	bt = db->db_data;
 	bt->bt_uncomp = 0;
 	bt->bt_comp = 0;
 	bt->bt_bytes = 0;
 	bt->bt_end = 0;
 	bt->bt_begin = 0;
 
 	dmu_buf_rele(db, FTAG);
 	return (obj);
 }
 
 /*
  * Free the bptree object "obj".  The tree is expected to be fully
  * drained: debug builds assert that bt_begin equals bt_end and that all
  * space counters are zero.  Returns the result of dmu_object_free().
  */
 int
 bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 {
 	bptree_phys_t *bt;
 	dmu_buf_t *db;
 	int err;
 
 	/* The bonus buffer is held only for the sanity checks below. */
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	bt = db->db_data;
 	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
 	ASSERT0(bt->bt_bytes);
 	ASSERT0(bt->bt_comp);
 	ASSERT0(bt->bt_uncomp);
 	dmu_buf_rele(db, FTAG);
 
 	err = dmu_object_free(os, obj, tx);
 	return (err);
 }
 
 /*
  * Returns TRUE when the bptree holds no pending entries, i.e. when
  * bt_begin has caught up with bt_end in its header.
  */
 boolean_t
 bptree_is_empty(objset_t *os, uint64_t obj)
 {
 	bptree_phys_t *phys;
 	boolean_t empty;
 	dmu_buf_t *db;
 
 	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
 	phys = db->db_data;
 	empty = (phys->bt_begin == phys->bt_end);
 	dmu_buf_rele(db, FTAG);
 
 	return (empty);
 }
 
 /*
  * Append a new entry for the root block pointer "bp", born in
  * "birth_txg", to the end of the bptree, and add bytes/comp/uncomp to the
  * totals kept in its header.  Must be called in syncing context.
  */
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 {
 	bptree_entry_phys_t bte = { 0 };
 	bptree_phys_t *bt;
 	dmu_buf_t *db;
 
 	/*
 	 * bptree objects are in the pool mos, therefore they can only be
 	 * modified in syncing context. Furthermore, this is only modified
 	 * by the sync thread, so no locking is necessary.
 	 */
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* Build the entry; its bookmark (be_zb) stays zeroed. */
 	bte.be_bp = *bp;
 	bte.be_birth_txg = birth_txg;
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	bt = db->db_data;
 
 	/* Write the entry into slot bt_end, then publish it in the header. */
 	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
 
 	dmu_buf_will_dirty(db, tx);
 	bt->bt_uncomp += uncomp;
 	bt->bt_comp += comp;
 	bt->bt_bytes += bytes;
 	bt->bt_end++;
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Traversal callback used by bptree_iterate().  Forwards every real
  * block pointer to the caller's function (ba_func) and, when freeing and
  * the function succeeded, subtracts the block's sizes from the bptree
  * header totals.  Returns the function's result, or 0 for a NULL bp or a
  * hole.
  */
 /* ARGSUSED */
 static int
 bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct bptree_args *ba = arg;
 	int err;
 
 	/* Nothing to report for a missing block pointer or a hole. */
 	if (bp == NULL || BP_IS_HOLE(bp))
 		return (0);
 
 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
 	if (err != 0 || !ba->ba_free)
 		return (err);
 
 	/* The block was handled while freeing: drop it from the totals. */
 	ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
 	ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
 	ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
 	return (err);
 }
 
 /*
  * Visit every block reachable from the entries currently queued in the
  * bptree object "obj" (indices bt_begin up to, but not including,
  * bt_end), calling "func" with "arg" and "tx" for each block pointer.
  *
  * If "free" is set:
  *  - It is assumed that "func" will be freeing the block pointers.
  *  - If "func" returns nonzero, the bookmark will be remembered and
  *    iteration will be restarted from this point on next invocation.
  *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
  *    bptree_iterate will remember the bookmark, continue traversing
  *    any additional entries, and return 0.
  *
  * If "free" is not set, traversal will stop and return an error if
  * an i/o error is encountered.
  *
  * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
  * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
  * traverse_dataset_destroyed()).
  *
  * Returns 0 or the error that stopped the iteration.  When "free" is set
  * the caller must be in syncing context.
  */
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
 	boolean_t ioerr = B_FALSE;
 	int err;
 	uint64_t i;
 	dmu_buf_t *db;
 	struct bptree_args ba;
 
 	ASSERT(!free || dmu_tx_is_syncing(tx));
 
 	err = dmu_bonus_hold(os, obj, FTAG, &db);
 	if (err != 0)
 		return (err);
 
 	/* The header in the bonus buffer is updated as entries are freed. */
 	if (free)
 		dmu_buf_will_dirty(db, tx);
 
 	ba.ba_phys = db->db_data;
 	ba.ba_free = free;
 	ba.ba_func = func;
 	ba.ba_arg = arg;
 	ba.ba_tx = tx;
 
 	err = 0;
 	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
 		bptree_entry_phys_t bte;
 		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 		    &bte, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
 		if (zfs_free_leak_on_eio)
 			flags |= TRAVERSE_HARD;
 		/*
 		 * "i" is a uint64_t, so it is cast and printed with %lld
 		 * like the other 64-bit arguments; a plain %d would not
 		 * match the argument's type.
 		 */
 		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
 		    "bookmark %lld/%lld/%lld/%lld",
 		    (longlong_t)i, (longlong_t)bte.be_birth_txg,
 		    (longlong_t)bte.be_zb.zb_objset,
 		    (longlong_t)bte.be_zb.zb_object,
 		    (longlong_t)bte.be_zb.zb_level,
 		    (longlong_t)bte.be_zb.zb_blkid);
 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
 		    bte.be_birth_txg, &bte.be_zb, flags,
 		    bptree_visit_cb, &ba);
 		if (free) {
 			/*
 			 * The callback has freed the visited block pointers.
 			 * Record our traversal progress on disk, either by
 			 * updating this record's bookmark, or by logically
 			 * removing this record by advancing bt_begin.
 			 */
 			if (err != 0) {
 				/* save bookmark for future resume */
 				ASSERT3U(bte.be_zb.zb_objset, ==,
 				    ZB_DESTROYED_OBJSET);
 				ASSERT0(bte.be_zb.zb_level);
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
 				if (err == EIO || err == ECKSUM ||
 				    err == ENXIO) {
 					/*
 					 * Skip the rest of this tree and
 					 * continue on to the next entry.
 					 */
 					err = 0;
 					ioerr = B_TRUE;
 				} else {
 					break;
 				}
 			} else if (ioerr) {
 				/*
 				 * This entry is finished, but there were
 				 * i/o errors on previous entries, so we
 				 * can't adjust bt_begin.  Set this entry's
 				 * be_birth_txg such that it will be
 				 * treated as a no-op in future traversals.
 				 */
 				bte.be_birth_txg = UINT64_MAX;
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
 			}
 
 			/* Retire the entry only while no i/o error was seen. */
 			if (!ioerr) {
 				ba.ba_phys->bt_begin++;
 				(void) dmu_free_range(os, obj,
 				    i * sizeof (bte), sizeof (bte), tx);
 			}
 		} else if (err != 0) {
 			break;
 		}
 	}
 
 	ASSERT(!free || err != 0 || ioerr ||
 	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
 	/* if all blocks are free there should be no used space */
 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
 		if (zfs_free_leak_on_eio) {
 			ba.ba_phys->bt_bytes = 0;
 			ba.ba_phys->bt_comp = 0;
 			ba.ba_phys->bt_uncomp = 0;
 		}
 
 		ASSERT0(ba.ba_phys->bt_bytes);
 		ASSERT0(ba.ba_phys->bt_comp);
 		ASSERT0(ba.ba_phys->bt_uncomp);
 	}
 
 	dmu_buf_rele(db, FTAG);
 
 	return (err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 296519)
@@ -1,3215 +1,3216 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 
 /*
  * Number of times that dbuf_free_range() took the slow path while doing
  * a zfs receive.  A nonzero value indicates a potential performance problem.
  */
 uint64_t zfs_free_range_recv_miss;
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 #ifndef __lint
 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
     dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
 #endif /* ! __lint */
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_cache;
 static taskq_t *dbu_evict_taskq;
 
 /*
  * Constructor registered with the dmu_buf_impl_t kmem cache (see
  * dbuf_init()).  Zeroes the object and initializes its mutex, condition
  * variable and hold refcount.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	bzero(dbuf, sizeof (*dbuf));
 	mutex_init(&dbuf->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf->db_changed, NULL, CV_DEFAULT, NULL);
 	refcount_create(&dbuf->db_holds);
 
 	return (0);
 }
 
 /*
  * Destructor registered with the dmu_buf_impl_t kmem cache (see
  * dbuf_init()).  Tears down the mutex, condition variable and hold
  * refcount that dbuf_cons() initialized.
  */
 /* ARGSUSED */
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	mutex_destroy(&dbuf->db_mtx);
 	cv_destroy(&dbuf->db_changed);
 	refcount_destroy(&dbuf->db_holds);
 }
 
 /*
  * dbuf hash table routines
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 static uint64_t dbuf_hash_count;
 
 /*
  * Fold the low byte of 'v' into the running CRC value 'crc' using the
  * zfs_crc64_table lookup table.
  */
 static uint64_t
 dbuf_hash_fold(uint64_t crc, uint64_t v)
 {
 	return ((crc >> 8) ^ zfs_crc64_table[(crc ^ v) & 0xFF]);
 }
 
 /*
  * Compute the 64-bit hash of a dbuf identity (objset pointer, object
  * number, level, block id).  Six bytes of the identity are run through
  * the CRC table; the remaining high-order bits are XORed into the result.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	uintptr_t osv = (uintptr_t)os;
 	uint64_t crc = -1ULL;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 	crc = dbuf_hash_fold(crc, lvl);
 	crc = dbuf_hash_fold(crc, osv >> 6);
 	crc = dbuf_hash_fold(crc, obj);
 	crc = dbuf_hash_fold(crc, obj >> 8);
 	crc = dbuf_hash_fold(crc, blkid);
 	crc = dbuf_hash_fold(crc, blkid >> 8);
 
 	return (crc ^ (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16));
 }
 
 /*
  * Hash a dbuf identity; thin wrapper around dbuf_hash().  The expansion is
  * a plain expression with no trailing semicolon, so the macro can be used
  * anywhere an expression is valid and the caller supplies the terminating
  * semicolon.
  */
 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
 
 /*
  * True when 'dbuf' has exactly the given identity: same object number,
  * objset pointer, level and block id.  Each argument is evaluated once,
  * left to right, with short-circuiting on the first mismatch.
  */
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 /*
  * Look up the dbuf with the given identity in the global hash table.
  *
  * Returns the matching dbuf with its db_mtx held, or NULL if there is no
  * match.  Entries in the DB_EVICTING state are skipped.  The hash bucket
  * mutex is held only for the duration of the search.
  */
 dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	uint64_t hashval = DBUF_HASH(os, obj, level, blkid);
 	uint64_t bucket = hashval & table->hash_table_mask;
 	dmu_buf_impl_t *cand;
 
 	mutex_enter(DBUF_HASH_MUTEX(table, bucket));
 	cand = table->hash_table[bucket];
 	while (cand != NULL) {
 		if (DBUF_EQUAL(cand, os, obj, level, blkid)) {
 			mutex_enter(&cand->db_mtx);
 			/* Found a live match; leave with db_mtx held. */
 			if (cand->db_state != DB_EVICTING)
 				break;
 			mutex_exit(&cand->db_mtx);
 		}
 		cand = cand->db_hash_next;
 	}
 	mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 
 	return (cand);
 }
 
 /*
  * Return the bonus dbuf of the given object with its db_mtx held, or NULL
  * if the dnode cannot be held or has no bonus dbuf.  The dnode hold taken
  * here is dropped before returning.
  */
 static dmu_buf_impl_t *
 dbuf_find_bonus(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *bonus;
 
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return (NULL);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	bonus = dn->dn_bonus;
 	if (bonus != NULL)
 		mutex_enter(&bonus->db_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (bonus);
 }
 
 /*
  * Add 'db' to the hash table unless an equal, non-evicting element is
  * already present.
  *
  * If such an element exists it is returned with its db_mtx held and 'db'
  * is not inserted.  Otherwise 'db' is linked at the head of its bucket,
  * its db_mtx is left held, and NULL is returned.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	objset_t *os = db->db_objset;
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid = db->db_blkid;
 	uint64_t hashval = DBUF_HASH(os, obj, level, blkid);
 	uint64_t bucket = hashval & table->hash_table_mask;
 	dmu_buf_impl_t *cur;
 
 	mutex_enter(DBUF_HASH_MUTEX(table, bucket));
 	cur = table->hash_table[bucket];
 	while (cur != NULL) {
 		if (DBUF_EQUAL(cur, os, obj, level, blkid)) {
 			mutex_enter(&cur->db_mtx);
 			if (cur->db_state != DB_EVICTING) {
 				/* Existing live entry wins. */
 				mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 				return (cur);
 			}
 			mutex_exit(&cur->db_mtx);
 		}
 		cur = cur->db_hash_next;
 	}
 
 	/* No live duplicate: push 'db' onto the front of the chain. */
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = table->hash_table[bucket];
 	table->hash_table[bucket] = db;
 	mutex_exit(DBUF_HASH_MUTEX(table, bucket));
 	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state,
  * have no holds, and the caller must not hold its db_mtx (all asserted).
  * The entry is expected to be present in its bucket's chain.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	/* Walk the chain by link address so the unlink is a single store. */
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		/*
 		 * Check for the end of the chain before forming a pointer
 		 * into *dbf.
 		 */
 		ASSERT(dbf != NULL);
 		dbp = &dbf->db_hash_next;
 	}
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_dec_64(&dbuf_hash_count);
 }
 
 static arc_evict_func_t dbuf_do_evict;
 
 /*
  * Selects which hold-count invariant dbuf_verify_user() checks.
  */
 typedef enum {
 	/* The user data is being evicted (passed by dbuf_evict_user()). */
 	DBVU_EVICTING,
 	/* Any other verification point. */
 	DBVU_NOT_EVICTING
 } dbvu_verify_type_t;
 
 /*
  * Debug-only (ZFS_DEBUG) sanity checks on a dbuf that has user data
  * attached.  Does nothing when no user is attached or when ZFS_DEBUG is
  * not defined.
  */
 static void
 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 {
 #ifdef ZFS_DEBUG
 	int64_t holds;
 
 	if (db->db_user == NULL)
 		return;
 
 	/* Only data blocks support the attachment of user data. */
 	ASSERT(db->db_level == 0);
 
 	/* Clients must resolve a dbuf before attaching user data. */
 	ASSERT(db->db.db_data != NULL);
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 
 	holds = refcount_count(&db->db_holds);
 	if (verify_type == DBVU_EVICTING ||
 	    db->db_user_immediate_evict == TRUE) {
 		/*
 		 * Immediate eviction occurs when holds == dirtycnt.
 		 * For normal eviction buffers, holds is zero on
 		 * eviction, except when dbuf_fix_old_data() calls
 		 * dbuf_clear_data().  However, the hold count can grow
 		 * during eviction even though db_mtx is held (see
 		 * dmu_bonus_hold() for an example), so we can only
 		 * test the generic invariant that holds >= dirtycnt.
 		 */
 		ASSERT3U(holds, >=, db->db_dirtycnt);
 	} else {
 		/* Not evicting and not immediate-evict: must be held. */
 		ASSERT3U(holds, >, 0);
 	}
 #endif
 }
 
 /*
  * Detach the user data (if any) from 'db' and schedule its eviction
  * callback on dbu_evict_taskq.  Caller must hold db_mtx.
  */
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	dmu_buf_user_t *user = db->db_user;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (user != NULL) {
 		dbuf_verify_user(db, DBVU_EVICTING);
 		db->db_user = NULL;
 
 #ifdef ZFS_DEBUG
 		if (user->dbu_clear_on_evict_dbufp != NULL)
 			*user->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
 		/*
 		 * Invoke the callback from a taskq to avoid lock order
 		 * reversals and limit stack depth.
 		 */
 		taskq_dispatch_ent(dbu_evict_taskq, user->dbu_evict_func,
 		    user, 0, &user->dbu_tqent);
 	}
 }
 
 /*
  * Report whether 'db' holds metadata.  Every dbuf above level 0 counts as
  * metadata; a level-0 dbuf is classified by its dnode's object type.
  */
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	boolean_t is_metadata;
 
 	if (db->db_level > 0)
 		return (B_TRUE);
 
 	DB_DNODE_ENTER(db);
 	is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 	DB_DNODE_EXIT(db);
 
 	return (is_metadata);
 }
 
 /*
  * Calls dbuf_clear() and then dbuf_destroy() on 'db'.
  *
  * The caller must hold db_mtx, and the dbuf must have no ARC buffer
  * attached (db_buf == NULL) and no pending dirty record
  * (db_data_pending == NULL); all three are asserted.
  */
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
 	dbuf_clear(db);
 	dbuf_destroy(db);
 }
 
 /*
  * One-time initialization of the dbuf layer: allocate the global hash
  * table, create the dmu_buf_impl_t kmem cache, initialize the hash bucket
  * mutexes and create the user-eviction taskq.
  */
 void
 dbuf_init(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hsize;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 4K block size.  The table will take up
 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
 	for (hsize = 1ULL << 16;
 	    hsize * 4096 < (uint64_t)physmem * PAGESIZE; hsize <<= 1)
 		continue;
 
 	/* Halve the table size until the allocation succeeds. */
 	for (;;) {
 		h->hash_table_mask = hsize - 1;
 		h->hash_table = kmem_zalloc(hsize * sizeof (void *),
 		    KM_NOSLEEP);
 		if (h->hash_table != NULL)
 			break;
 		/* XXX - we should really return an error instead of assert */
 		ASSERT(hsize > (1ULL << 10));
 		hsize >>= 1;
 	}
 
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
 }
 
 /*
  * Undo dbuf_init(): destroy the hash bucket mutexes, free the hash table,
  * and destroy the dbuf kmem cache and the user-eviction taskq.
  */
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	int m;
 
 	for (m = 0; m < DBUF_MUTEXES; m++)
 		mutex_destroy(&table->hash_mutexes[m]);
 
 	/* hash_table_mask is (number of buckets - 1); see dbuf_init(). */
 	kmem_free(table->hash_table,
 	    (table->hash_table_mask + 1) * sizeof (void *));
 	kmem_cache_destroy(dbuf_cache);
 	taskq_destroy(dbu_evict_taskq);
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency checks on a dbuf.  The caller must hold db_mtx.
  * The checks run only when ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.
  *
  * Verifies that the dbuf agrees with its dnode (object, objset, level),
  * that its offset/size match its block id, that every dirty record points
  * back at this dbuf, that db_blkptr points where the parent says it
  * should, and that a clean buffer with no block pointer contains only
  * zeroes.
  */
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;
 
 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	/* Offset/size expectations depend on the kind of block. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
 	/* Every dirty record on either list must refer back to this dbuf. */
 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}
 
 	/*
 	 * verify db->db_blkptr
 	 *
 	 * NOTE(review): dn is dereferenced from here on without rechecking
 	 * the dn == NULL case handled above; that branch asserts
 	 * db_blkptr == NULL, but the final check below still reads
 	 * dn->dn_free_txg -- confirm dn cannot be NULL here.
 	 */
 	if (db->db_blkptr) {
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			/* epb: number of block pointers in the parent block */
 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 */
 		if (db->db_dirtycnt == 0) {
 			uint64_t *buf = db->db.db_data;
 			int i;
 
 			/* Scan the buffer as 64-bit words (db_size / 8). */
 			for (i = 0; i < db->db.db_size >> 3; i++) {
 				ASSERT(buf[i] == 0);
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 /*
  * Drop this dbuf's reference to its data: evict any attached user, clear
  * both data pointers, and mark the dbuf DB_UNCACHED unless it is in the
  * DB_NOFILL state.  Caller must hold db_mtx.
  */
 static void
 dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/* The user must be evicted while the data is still attached. */
 	dbuf_evict_user(db);
 
 	db->db.db_data = NULL;
 	db->db_buf = NULL;
 	if (db->db_state != DB_NOFILL) {
 		db->db_state = DB_UNCACHED;
 	}
 }
 
 /*
  * Attach ARC buffer 'buf' as this dbuf's data.  Unless the buffer has been
  * released from the ARC, dbuf_do_evict is registered as its eviction
  * callback.  Caller must hold db_mtx; 'buf' and its data must be non-NULL.
  */
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(buf != NULL);
 	ASSERT(buf->b_data != NULL);
 
 	db->db_buf = buf;
 	db->db.db_data = buf->b_data;
 
 	if (!arc_released(buf)) {
 		arc_set_callback(buf, dbuf_do_evict, db);
 	}
 }
 
 /*
  * Loan out an arc_buf for read.  Return the loaned arc_buf.
  *
  * If the dbuf's buffer has been released from the ARC or the dbuf has more
  * than one hold, a new loaned buffer is allocated and the dbuf's data is
  * copied into it.  Otherwise the dbuf's own buffer is loaned out and the
  * dbuf's data is cleared.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *abuf;
 
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
 		spa_t *spa = db->db_objset->os_spa;
 
 		/*
 		 * NOTE(review): db_mtx is dropped before the copy below
 		 * reads db->db.db_data -- presumably the caller's hold
 		 * keeps the data stable; confirm.
 		 */
 		mutex_exit(&db->db_mtx);
 		abuf = arc_loan_buf(spa, blksz);
 		bcopy(db->db.db_data, abuf->b_data, blksz);
 	} else {
 		/* Sole holder of an unreleased buffer: hand it over as-is. */
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
 		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 	}
 	return (abuf);
 }
 
 /*
  * Return the id of the level-'level' block that covers byte 'offset' of
  * the object's level-0 data.
  *
  * When either dn_datablkshift or dn_indblkshift is zero the answer is
  * always block 0, and 'offset' is asserted to be less than dn_datablksz.
  */
 uint64_t
 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
 	if (dn->dn_datablkshift == 0 || dn->dn_indblkshift == 0) {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}
 
 	/*
 	 * The level n blkid is the level 0 blkid divided by the number of
 	 * level 0 blocks that one level n block spans.
 	 *
 	 * Level 0 blkid:
 	 *	offset >> datablkshift  (i.e. offset / 2^datablkshift)
 	 *
 	 * Level 0 blocks per level n block: the number of block pointers
 	 * in an indirect block, raised to the power of level:
 	 *	2^(indblkshift - SPA_BLKPTRSHIFT)^level
 	 *	= 2^(level * (indblkshift - SPA_BLKPTRSHIFT))
 	 *
 	 * Combining the two divisions into one shift:
 	 *	offset >> (datablkshift +
 	 *	    level * (indblkshift - SPA_BLKPTRSHIFT))
 	 */
 	return (offset >> (dn->dn_datablkshift + level *
 	    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 }
 
 /*
  * Read-completion callback passed to arc_read() by dbuf_read_impl().
  *
  * zio: the completed I/O; may be NULL, which is treated as success.
  * buf: the ARC buffer holding the data that was read.
  * vdb: the dmu_buf_impl_t the read was issued for.
  *
  * Moves the dbuf out of DB_READ (to DB_CACHED on success or when the block
  * was freed in flight, to DB_UNCACHED on error), wakes any waiters on
  * db_changed, and drops a hold on the dbuf via dbuf_rele_and_unlock().
  */
 static void
 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);
 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
 		/* The freed block must read back as zeroes. */
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else if (zio == NULL || zio->io_error == 0) {
 		/* Successful read: attach the data. */
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
 		/* Read failed: drop the buffer and return to UNCACHED. */
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
 	dbuf_rele_and_unlock(db, NULL);
 }
 
 /*
  * Start filling an uncached dbuf with data.
  *
  * Entered with db_mtx held, the dnode's dn_struct_rwlock held, at least
  * one hold on the dbuf, and the dbuf in DB_UNCACHED with no buffer (all
  * asserted).  db_mtx is dropped on every path before returning.
  *
  * Three cases:
  *  - bonus buffer: copied straight from the dnode's bonus area; the dbuf
  *    is DB_CACHED on return.
  *  - hole / freed block: a zero-filled buffer is attached; the dbuf is
  *    DB_CACHED on return.
  *  - otherwise: the dbuf goes to DB_READ and an arc_read() is issued as a
  *    child of 'zio' with dbuf_read_done() as the completion callback.
  *
  * flags: DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL for the read; otherwise
  * ZIO_FLAG_MUSTSUCCEED is used.
  */
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	arc_flags_t aflags = ARC_FLAG_NOWAIT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Use the smaller of the in-core and on-disk bonus lengths. */
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		/* Zero first so the bytes past bonuslen are defined. */
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	/*
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 		/* Nothing on disk to read: present an all-zero block. */
 		DB_DNODE_EXIT(db);
 		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_FLAG_L2CACHE;
 	if (DBUF_IS_L2COMPRESSIBLE(db))
 		aflags |= ARC_FLAG_L2COMPRESS;
 
 	/* Objsets without a dataset are bookmarked as DMU_META_OBJSET. */
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Extra hold for the read in flight; dbuf_read_done() drops one. */
 	dbuf_add_ref(db, NULL);
 
 	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
 }
 
 /*
  * Make sure the dbuf's data is (being) read in.  The caller must have a
  * hold on the dbuf.
  *
  * zio:   optional parent zio.  When NULL and a read has to be issued, a
  *        root zio is created here and waited on before returning; when
  *        non-NULL the read is issued under it and not waited for here.
  * flags: DB_RF_HAVESTRUCT  - caller already holds dn_struct_rwlock;
  *        DB_RF_NOPREFETCH  - do not call dmu_zfetch();
  *        DB_RF_NEVERWAIT   - do not wait for a read/fill in progress;
  *        DB_RF_CANFAIL     - passed through to dbuf_read_impl().
  *
  * Returns 0 on success; EIO if the dbuf is DB_NOFILL or if, after waiting
  * for another thread's read or fill, the dbuf ended up DB_UNCACHED;
  * otherwise the result of zio_wait() on the locally created root zio.
  */
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t havepzio = (zio != NULL);
 	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_NOFILL)
 		return (SET_ERROR(EIO));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/*
 	 * Prefetch only for cacheable level-0 non-bonus blocks.
 	 * NOTE(review): the dn != NULL test comes after dn may already
 	 * have been dereferenced by rw_enter() above.
 	 */
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		/* Data already present; nothing to read. */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
 		spa_t *spa = dn->dn_objset->os_spa;
 
 		if (zio == NULL)
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		dbuf_read_impl(db, zio, flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Only wait on a zio that was created here. */
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
 		/*
 		 * Another reader came in while the dbuf was in flight
 		 * between UNCACHED and CACHED.  Either a writer will finish
 		 * writing the buffer (sending the dbuf to CACHED) or the
 		 * first reader's request will reach the read_done callback
 		 * and send the dbuf to CACHED.  Otherwise, a failure
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
 				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 		}
 		mutex_exit(&db->db_mtx);
 	}
 
 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
 	return (err);
 }
 
 /*
  * Prepare a held, non-bonus dbuf for being written without first reading
  * its current contents.
  *
  * Waits for any read or fill in progress, then: an uncached dbuf gets a
  * fresh ARC buffer and enters DB_FILL; a DB_NOFILL dbuf has its data
  * cleared; anything else is expected to be DB_CACHED already.
  */
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	switch (db->db_state) {
 	case DB_UNCACHED: {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 		break;
 	}
 	case DB_NOFILL:
 		dbuf_clear_data(db);
 		break;
 	default:
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 		break;
 	}
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * This is our just-in-time copy function.  It makes a copy of
  * buffers that have been modified in a previous transaction
  * group, before we modify them in the current active group.
  *
  * This function is used in two places: when we are dirtying a
  * buffer for the first time in a txg, and when we are freeing
  * a range in a dnode that includes this buffer.
  *
  * Note that when we are called from dbuf_free_range() we do
  * not put a hold on the buffer, we just traverse the active
  * dbuf list for the dnode.
  *
  * The caller must hold db_mtx; db must be a level-0 dbuf with data
  * attached that does not belong to the meta-dnode object (asserted).
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	/*
 	 * Nothing to do unless the newest dirty record still shares its
 	 * data with the dbuf (db_data for bonus buffers, db_buf otherwise).
 	 */
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;
 
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and it's referencing the dbuf data, either:
 	 *	reset the reference to point to a new copy,
 	 * or (if there are no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT(dr->dr_txg >= txg - 2);
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		/* More holds than dirty records: give dr a private copy. */
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
 		/* dr keeps the existing buffer; the dbuf lets go of it. */
 		dbuf_clear_data(db);
 	}
 }
 
 /*
  * Cancel the override recorded in a level-0 dirty record.
  *
  * If 'dr' has been overridden (dr_override_state is not
  * DR_NOT_OVERRIDDEN), free the overriding block pointer (unless it is a
  * hole or the record is marked nopwrite), reset the override state and the
  * nopwrite flag, and arc_release() the record's data buffer.  Does nothing
  * for bonus buffers or records that are not overridden.
  *
  * The caller must hold the dbuf's db_mtx, and the record must not be in
  * the DR_IN_DMU_SYNC state (asserted).
  */
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
 		zio_free(db->db_objset->os_spa, txg, bp);
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
 	 * modifying the buffer, so they will immediately do
 	 * another (redundant) arc_release().  Therefore, leave
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
 	arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  *
  * dn:          dnode whose dbufs are examined.
  * start_blkid: first level-0 block id of the range (inclusive).
  * end_blkid:   last level-0 block id of the range (inclusive); clamped
  *              to dn_maxblkid unless it is DMU_SPILL_BLKID.
  * tx:          transaction in which the free is happening.
  *
  * Returns immediately, without walking the dbuf tree, when start_blkid is
  * at or beyond dn_unlisted_l0_blkid.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 
 	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
 	/*
 	 * Search key only: just the fields set here are initialized.  The
 	 * lookups below assert that avl_find() never matches it, so it
 	 * serves purely to obtain an insertion point for avl_nearest().
 	 */
 	db_search.db_level = 0;
 	db_search.db_blkid = start_blkid;
 	db_search.db_state = DB_SEARCH;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 		/* There can't be any dbufs in this range; no need to search. */
 #ifdef DEBUG
 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
 		ASSERT3P(db, ==, NULL);
 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 		ASSERT(db == NULL || db->db_level > 0);
 #endif
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return;
 	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
 		/*
 		 * If we are receiving, we expect there to be no dbufs in
 		 * the range to be freed, because receive modifies each
 		 * block at most once, and in offset order.  If this is
 		 * not the case, it can lead to performance problems,
 		 * so note that we unexpectedly took the slow path.
 		 */
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
 	/* Position on the first dbuf at or after start_blkid. */
 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
 	ASSERT3P(db, ==, NULL);
 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 	for (; db != NULL; db = db_next) {
 		/* Fetch the successor first: db may be destroyed below. */
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}
 
 		/* No cached data to clear in these states. */
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			/* Unreferenced: dbuf_clear() is handed the held db_mtx. */
 			ASSERT(db->db_buf);
 			dbuf_clear(db);
 			continue;
 		}
 		/* The dbuf is referenced */
 
 		if (db->db_last_dirty != NULL) {
 			dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if it's not referenced in
 				 * the open context) or reset its contents to
 				 * empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if it's cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 /*
  * Decide whether the block backing this dbuf could be freed.
  *
  * Returns B_FALSE when no birth txg can be determined for the block.
  * Otherwise returns true when the objset has no dataset, or when
  * dsl_dataset_block_freeable() says a block born in that txg is freeable.
  * Caller must hold db_mtx.
  */
 static int
 dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;
 
 	/*
 	 * We don't need any locking to protect db_blkptr:
 	 * If it's syncing, then db_last_dirty will be set
 	 * so we'll ignore db_blkptr.
 	 *
 	 * This logic ensures that only block births for
 	 * filled blocks are considered.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	if (db->db_last_dirty && (db->db_blkptr == NULL ||
 	    !BP_IS_HOLE(db->db_blkptr))) {
 		birth_txg = db->db_last_dirty->dr_txg;
 	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 		birth_txg = db->db_blkptr->blk_birth;
 	}
 
 	/* A block that doesn't exist can't be freed. */
 	if (birth_txg == 0)
 		return (B_FALSE);
 
 	/*
 	 * A block that is in a snapshot can't be freed either.
 	 * Don't pass the bp to dsl_dataset_block_freeable() since we
 	 * are holding the db_mtx lock and might deadlock if we are
 	 * prefetching a dedup-ed block.
 	 */
 	return (ds == NULL ||
 	    dsl_dataset_block_freeable(ds, NULL, birth_txg));
 }
 
 /*
  * Change the size of a non-bonus dbuf's data buffer to 'size' bytes as
  * part of transaction 'tx'.
  *
  * The dbuf is dirtied, a new ARC buffer of the requested size is
  * allocated, the first MIN(old size, new size) bytes are copied across and
  * any remainder is zeroed.  The new buffer then replaces the old one under
  * db_mtx, and for a level-0 dbuf the current dirty record is pointed at
  * the new buffer.  The caller must hold dn_struct_rwlock as writer
  * (asserted).
  */
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
 	 */
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);
 
 	/* create the data buffer for the new block */
 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
 	/* zero the remainder */
 	if (size > osize)
 		bzero((uint8_t *)buf->b_data + osize, size - osize);
 
 	/* Swap the new buffer in and drop the old one. */
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
 	VERIFY(arc_buf_remove_ref(obuf, db));
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 		db->db_last_dirty->dt.dl.dr_data = buf;
 	}
 	mutex_exit(&db->db_mtx);
 
 	/* size-osize is negative when the buffer shrinks. */
 	dnode_willuse_space(dn, size-osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * arc_release() this dbuf's ARC buffer.
  *
  * Asserts that we are in the pool's syncing context, that the objset's
  * phys buffer has been released (or its dataset is on the synced list),
  * and that the parent dbuf's buffer, if there is a parent, has already
  * been released.
  */
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	objset_t *os = db->db_objset;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * Handle a dbuf that is being dirtied again in a TXG for which it
  * already has a dirty record (dr).  The caller must hold db_mtx.
  */
 static void
 dbuf_redirty(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/* Only level-0, non-bonus buffers need any work here. */
 	if (db->db_level != 0 || db->db_blkid == DMU_BONUS_BLKID)
 		return;
 
 	/*
 	 * The buffer may already have been written out; if so its
 	 * state has to be reset before it is modified again.
 	 */
 	dbuf_unoverride(dr);
 
 	if (db->db.db_object == DMU_META_DNODE_OBJECT ||
 	    db->db_state == DB_NOFILL)
 		return;
 
 	/* The initial dirty already released the buffer; just thaw it. */
 	ASSERT(arc_released(db->db_buf));
 	arc_buf_thaw(db->db_buf);
 }
 
 /*
  * Mark db dirty in the transaction group of tx and return the dirty record
  * for that txg.
  *
  * If db already has a dirty record for this txg, it is re-dirtied through
  * dbuf_redirty() and that record is returned.  Otherwise a new record is
  * allocated and linked into db's dirty-record list, a hold tagged with the
  * txg is added to db, and the record is attached either to the dnode's
  * per-txg dirty list or to the dirty record of the parent indirect block,
  * which is dirtied by a recursive call.
  *
  * The caller must already hold db (asserted) and must not hold db_mtx,
  * which is taken and dropped here.
  */
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	boolean_t do_free_accounting = B_FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
 
 	mutex_enter(&dn->dn_mtx);
 	/*
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 		dn->dn_dirtyctx =
 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
 
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	drp = &db->db_last_dirty;
 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	/* skip records for newer txgs; drp ends at the insertion point */
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
 		dbuf_redirty(dr);
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}
 
 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * Update the accounting.
 		 * Note: we delay "free accounting" until after we drop
 		 * the db_mtx.  This keeps us from grabbing other locks
 		 * (and possibly deadlocking) in bp_get_dsize() while
 		 * also holding the db_mtx.
 		 */
 		dnode_willuse_space(dn, db->db.db_size, tx);
 		do_free_accounting = dbuf_block_freeable(db);
 	}
 
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		/* indirect block: set up the list of child dirty records */
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
 		dr->dr_accounted = db->db.db_size;
 	/* link the new record into db's dirty list at the insertion point */
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
 	*drp = dr;
 
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);
 
 	mutex_exit(&db->db_mtx);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		/*
 		 * Bonus and spill records go straight onto the dnode's
 		 * per-txg dirty list; no parent dbuf is dirtied here.
 		 */
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	} else if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
 		/*
 		 * This is only a guess -- if the dbuf is dirty
 		 * in a previous txg, we don't know how much
 		 * space it will use on disk yet.  We should
 		 * really have the struct_rwlock to access
 		 * db_blkptr, but since this is just a guess,
 		 * it's OK if we get an odd answer.
 		 */
 		ddt_prefetch(os->os_spa, bp);
 		dnode_willuse_space(dn, -willfree, tx);
 	}
 
 	/* take dn_struct_rwlock as reader unless we already hold it as writer */
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
 		/* there is an indirect level above us: dirty the parent too */
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;
 
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 			parent = dbuf_hold_level(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level+1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		/* top-level block: the record hangs off the dnode itself */
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  *
  * Only level-0, non-bonus dbufs are accepted, and the caller must hold
  * db_mtx (all asserted).  Returns B_FALSE without side effects if db has
  * no dirty record for tx's txg.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
 
 	/*
 	 * Due to our use of dn_nlevels below, this can only be called
 	 * in open context, unless we are operating on the MOS.
 	 * From syncing context, dn_nlevels may be different from the
 	 * dn_nlevels used when dbuf was dirtied.
 	 */
 	ASSERT(db->db_objset ==
 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
 	if (dr == NULL || dr->dr_txg < txg)
 		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
 	    dr->dr_accounted, txg);
 
 	/* unlink dr from db's list of dirty records */
 	*drp = dr->dr_next;
 
 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 	DB_DNODE_EXIT(db);
 
 	if (db->db_state != DB_NOFILL) {
 		dbuf_unoverride(dr);
 
 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		/* the record has its own copy of the data; drop it */
 		if (dr->dt.dl.dr_data != db->db_buf)
 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	/*
 	 * Drop the hold tagged with this txg.  If that was the last hold,
 	 * tear down the data buffer and evict the dbuf.
 	 */
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		arc_buf_t *buf = db->db_buf;
 
 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
 		dbuf_clear_data(db);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		dbuf_evict(db);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Announce that the caller is about to modify db_fake in tx: make sure
  * the buffer has been read in and has a dirty record for tx's txg.
  */
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dbuf_dirty_record_t *dr;
 	int read_flags = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	/*
 	 * Fast path: look for an existing dirty record for this txg.
 	 * For blocks that are already dirty this cuts the runtime of
 	 * this function by >90%, and improves overall performance by
 	 * 50% for some workloads (e.g. file deletion with indirect
 	 * blocks cached).
 	 */
 	mutex_enter(&db->db_mtx);
 	dr = db->db_last_dirty;
 	while (dr != NULL && dr->dr_txg >= tx->tx_txg) {
 		/*
 		 * A dirty record alone is not enough: some callers reach
 		 * dbuf_dirty() without going through this function, so
 		 * the dbuf can be dirty without being cached.
 		 */
 		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
 			/* Dirty in this txg and cached: just re-dirty. */
 			dbuf_redirty(dr);
 			mutex_exit(&db->db_mtx);
 			return;
 		}
 		dr = dr->dr_next;
 	}
 	mutex_exit(&db->db_mtx);
 
 	/* Slow path: read the buffer in, then dirty it. */
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		read_flags |= DB_RF_HAVESTRUCT;
 	DB_DNODE_EXIT(db);
 	(void) dbuf_read(db, NULL, read_flags);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * Put the dbuf into the DB_NOFILL state and then run it through the
  * regular dmu_buf_will_fill() path for tx.
  */
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	((dmu_buf_impl_t *)db_fake)->db_state = DB_NOFILL;
 	dmu_buf_will_fill(db_fake, tx);
 }
 
 /*
  * Prepare a held, level-0, non-bonus dbuf to be filled by the caller in
  * tx: dbuf_noread() is called on it and it is then dirtied in tx's txg.
  * Meta-dnode buffers are only accepted when dmu_tx_private_ok(tx) holds
  * (asserted).
  */
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
 	/* the returned dirty record is not needed here */
 	(void) dbuf_dirty(db, tx);
 }
 
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
 /*
  * Finish a fill: if db is in the DB_FILL state, move it to DB_CACHED and
  * wake up anyone waiting on db_changed.  tx is unused.
  */
 /* ARGSUSED */
 void
 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	/* Nothing to do unless a fill is actually in progress. */
 	if (db->db_state != DB_FILL) {
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		/*
 		 * The block was freed while it was being filled, so
 		 * hand back zeroes instead of the filled data.
 		 */
 		/* XXX dbuf_undirty? */
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	db->db_state = DB_CACHED;
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Dirty a level-0, non-bonus dbuf in tx without filling it, and override
  * its dirty record with an embedded block pointer built from `data`.
  *
  * dbuf: the buffer to override
  * data, comp, uncompressed_size, compressed_size: payload and its
  *	compression description, passed to encode_embedded_bp_compressed()
  * etype: embedded type stored in the block pointer
  * byteorder: byte order recorded in the block pointer
  * tx: transaction whose txg becomes the block pointer's birth txg
  */
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	struct dirty_leaf *dl;
 	dmu_object_type_t type;
 
 	if (etype == BP_EMBEDDED_TYPE_DATA) {
 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
 		    SPA_FEATURE_EMBEDDED_DATA));
 	}
 
 	/* fetch the object type from the dnode for BP_SET_TYPE() below */
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	/* the newest dirty record must now be the one for this txg */
 	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 	dl = &db->db_last_dirty->dt.dl;
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
 	BP_SET_TYPE(&dl->dr_overridden_by, type);
 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	/* mark the record as overridden by the block pointer built above */
 	dl->dr_override_state = DR_OVERRIDDEN;
 	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  *
  * db must be a held, level-0, non-bonus dbuf of type ARC_BUFC_DATA, and
  * buf must be exactly db's size (all asserted).  In the copy case buf's
  * reference is dropped here; in the assign case buf becomes db's data
  * buffer.  Either way db is dirtied in tx.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
 	ASSERT(buf != NULL);
 	ASSERT(arc_buf_size(buf) == db->db.db_size);
 	ASSERT(tx->tx_txg != 0);
 
 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));
 
 	mutex_enter(&db->db_mtx);
 
 	/* wait for any in-progress read or fill to finish */
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
 
 	/*
 	 * Copy path: the dbuf is cached and has more holds than our
 	 * caller's one plus those accounted for by db_dirtycnt.
 	 */
 	if (db->db_state == DB_CACHED &&
 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		xuio_stat_wbuf_copied();
 		return;
 	}
 
 	/* Assign path: detach the current buffer (if any), then adopt buf. */
 	xuio_stat_wbuf_nocopy();
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			/* already dirty in this txg: repoint the record */
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			/* no dirty record refers to db_buf, so drop it */
 			arc_release(db->db_buf, db);
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		}
 		db->db_buf = NULL;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx);
 }
 
 /*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
  * EVICTING and clear *most* of its references.  Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
  * in this case.  For callers from the DMU we will usually see:
  *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
  *	DMU: dbuf_clear()->arc_clear_callback()
  *	ARC: dbuf_do_evict()->dbuf_destroy()
  *
  * This routine will dissociate the dbuf from the arc, by calling
  * arc_clear_callback(), but will not evict the data from the ARC.
  *
  * Must be called with db_mtx held and with no holds on the dbuf (both
  * asserted).  db_mtx is dropped before returning unless
  * arc_clear_callback() reports that the dbuf is already gone.
  */
 void
 dbuf_clear(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 	boolean_t dbuf_gone = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	dbuf_evict_user(db);
 
 	if (db->db_state == DB_CACHED) {
 		ASSERT(db->db.db_data != NULL);
 		/* bonus data is freed here directly via zio_buf_free() */
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db.db_data = NULL;
 		db->db_state = DB_UNCACHED;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	/*
 	 * Only unlink from dn_dbufs when this thread holds dn_dbufs_mtx;
 	 * otherwise dbuf_destroy() does it later.
 	 */
 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
 		avl_remove(&dn->dn_dbufs, db);
 		atomic_dec_32(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		dnode_rele(dn, db);
 		db->db_dnode_handle = NULL;
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
 	if (db->db_buf)
 		dbuf_gone = arc_clear_callback(db->db_buf);
 
 	/* if the dbuf is gone, db (and its mutex) must not be touched */
 	if (!dbuf_gone)
 		mutex_exit(&db->db_mtx);
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
 /*
  * Locate the block pointer for block `blkid` at `level` of dnode dn, along
  * with the dbuf that contains that block pointer.
  *
  * On success returns 0 with *bpp set and, where there is a containing
  * dbuf, *parentp set to it with a reference added.  Returns ENOENT if the
  * block lies beyond what the on-disk dnode currently covers, or the error
  * from holding/reading the parent indirect block.  On error *parentp is
  * NULL.
  *
  * Note: While bpp will always be updated if the function returns success,
  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
  * this happens when the dnode is the meta-dnode, or a userused or groupused
  * object.
  */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	int nlevels, epbs;
 
 	*parentp = NULL;
 	*bpp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	/* the spill block pointer lives in the dnode itself */
 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = &dn->dn_phys->dn_spill;
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}
 
 	/* treat an on-disk dnode with zero levels as having one */
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
 	else
 		nlevels = dn->dn_phys->dn_nlevels;
 
 	/* log2 of the number of block pointers per indirect block */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		/* index into the parent's array of block pointers */
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 /*
  * Allocate and initialize a dbuf for block `blkid` at `level` of dn, with
  * the given parent dbuf and block pointer.
  *
  * Bonus dbufs are returned right after initialization and are not put in
  * the hash table.  All other dbufs are inserted into the dbuf hash table
  * and the dnode's dn_dbufs tree; if an equivalent dbuf is already hashed,
  * the new one is freed and the existing one is returned instead.  A newly
  * inserted dbuf adds a reference to its parent (unless the parent is the
  * dnode's own dbuf) and a hold on the dnode.
  *
  * The caller must hold dn_struct_rwlock (asserted).
  */
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
 	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
 	db->db_user = NULL;
 	db->db_user_immediate_evict = FALSE;
 	db->db_freed_in_flight = FALSE;
 	db->db_pending_evict = FALSE;
 
 	/* size and offset depend on which kind of block this is */
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = DN_MAX_BONUSLEN -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before it's added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	/* stays DB_EVICTING until the dbuf is fully linked in below */
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 	if (db->db_level == 0 && db->db_blkid >=
 	    dn->dn_unlisted_l0_blkid)
 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
 	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
 	/* reference the parent indirect dbuf, tagged with the child */
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	/* each dbuf holds its dnode; dn_dbufs_count tracks those holds */
 	(void) refcount_add(&dn->dn_holds, db);
 	atomic_inc_32(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 /*
  * Eviction callback for a dbuf with no holds; `private` is the dbuf.
  * Takes db_mtx unless the current thread already holds it.  A dbuf that
  * is already DB_EVICTING is destroyed outright; otherwise it must be
  * DB_CACHED and is detached from its buffer and evicted.  Always
  * returns 0.
  */
 static int
 dbuf_do_evict(void *private)
 {
 	dmu_buf_impl_t *db = private;
 
 	if (!MUTEX_HELD(&db->db_mtx))
 		mutex_enter(&db->db_mtx);
 
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_EVICTING) {
 		/* Already cleared elsewhere; only destruction is left. */
 		mutex_exit(&db->db_mtx);
 		dbuf_destroy(db);
 		return (0);
 	}
 
 	ASSERT(db->db_state == DB_CACHED);
 	DBUF_VERIFY(db);
 	db->db_buf = NULL;
 	dbuf_evict(db);
 	return (0);
 }
 
 /*
  * Free a dbuf that has no holds left.  Non-bonus dbufs are first removed
  * from the dnode's dn_dbufs tree (if still attached to a dnode) and from
  * the dbuf hash table.  The dbuf is then returned to dbuf_cache and its
  * ARC space accounting is given back.
  */
 static void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * If this dbuf is still on the dn_dbufs list,
 		 * remove it from that list.
 		 */
 		if (db->db_dnode_handle != NULL) {
 			dnode_t *dn;
 
 			DB_DNODE_ENTER(db);
 			dn = DB_DNODE(db);
 			mutex_enter(&dn->dn_dbufs_mtx);
 			avl_remove(&dn->dn_dbufs, db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 			mutex_exit(&dn->dn_dbufs_mtx);
 			DB_DNODE_EXIT(db);
 			/*
 			 * Decrementing the dbuf count means that the hold
 			 * corresponding to the removed dbuf is no longer
 			 * discounted in dnode_move(), so the dnode cannot be
 			 * moved until after we release the hold.
 			 */
 			dnode_rele(dn, db);
 			db->db_dnode_handle = NULL;
 		}
 		dbuf_hash_remove(db);
 	}
 	db->db_parent = NULL;
 	db->db_buf = NULL;
 
 	/* everything else must already have been torn down */
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
 	kmem_cache_free(dbuf_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
 /*
  * State for one prefetch request, passed from read callback to read
  * callback while the indirect blocks above the target are read in.
  */
 typedef struct dbuf_prefetch_arg {
 	/* The spa to issue the prefetch in. */
 	spa_t *dpa_spa;
 	/* The target block to prefetch. */
 	zbookmark_phys_t dpa_zb;
 	/* Entries (blkptr_t's) Per Block Shift. */
 	int dpa_epbs;
 	/* The current level that we're reading. */
 	int dpa_curlevel;
 	/* The priority I/Os should be issued at. */
 	zio_priority_t dpa_prio;
 	/* The parent zio_t for all prefetches. */
 	zio_t *dpa_zio;
 	/* Flags to pass to the final prefetch. */
 	arc_flags_t dpa_aflags;
 } dbuf_prefetch_arg_t;
 
 /*
  * Issue the prefetch read for the target block itself, described by bp.
  * Holes and embedded block pointers are skipped.  The read is issued as
  * a no-wait, speculative prefetch under dpa's parent zio.
  */
 static void
 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 {
 	arc_flags_t aflags;
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return;
 
 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
 	ASSERT(dpa->dpa_zio != NULL);
 
 	/* Caller-supplied flags plus the ones every prefetch needs. */
 	aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
 	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 	    &aflags, &dpa->dpa_zb);
 }
 
 /*
  * Called when an indirect block above our prefetch target is read in.  This
  * will either read in the next indirect block down the tree or issue the actual
  * prefetch if the next block down is our target.
  *
  * zio: the completed read, or NULL
  * abuf: buffer holding the indirect block's contents; its reference is
  *	dropped before returning
  * private: the dbuf_prefetch_arg_t for this prefetch.  It is freed here
  *	when the chain ends (hole, read error, or final prefetch issued);
  *	otherwise it is handed on to the next arc_read().
  */
 static void
 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;
 
 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
 	ASSERT3S(dpa->dpa_curlevel, >, 0);
 	if (zio != NULL) {
 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
 		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
 	}
 
 	/* step down one level towards the target */
 	dpa->dpa_curlevel--;
 
 	/* blkid, at the new level, of the block on the path to the target */
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	/* its block pointer within the indirect block we just read */
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
 		/* nothing below here to prefetch; end the chain */
 		kmem_free(dpa, sizeof (*dpa));
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
 		dbuf_issue_final_prefetch(dpa, bp);
 		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 
 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
 
 		/* read the next indirect block; this function runs again */
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 	/*
 	 * NOTE(review): dpa may already have been freed above; `private` is
 	 * presumably used only as an opaque tag value here -- confirm.
 	 */
 	(void) arc_buf_remove_ref(abuf, private);
 }
 
 /*
  * Issue prefetch reads for the given block on the given level.  If the indirect
  * blocks above that block are not in memory, we will read them in
  * asynchronously.  As a result, this call never blocks waiting for a read to
  * complete.
  *
  * dn: dnode owning the block; dn_struct_rwlock must be held (asserted)
  * level, blkid: the block to prefetch; must not be the bonus block
  * prio: priority the reads are issued at
  * aflags: extra ARC flags for the final (target block) read
  *
  * Returns without issuing anything when the block is past dn_maxblkid, has
  * been freed, is not covered by the on-disk dnode, already has a dbuf, or
  * when the block pointer reached is a hole.
  */
 void
 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
     arc_flags_t aflags)
 {
 	blkptr_t bp;
 	int epbs, nlevels, curlevel;
 	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (blkid > dn->dn_maxblkid)
 		return;
 
 	if (dnode_block_freed(dn, blkid))
 		return;
 
 	/*
 	 * This dnode hasn't been written to disk yet, so there's nothing to
 	 * prefetch.
 	 */
 	nlevels = dn->dn_phys->dn_nlevels;
 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
 		return;
 
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
 		return;
 
 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
 	    level, blkid);
 	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
 		/*
 		 * This dbuf already exists.  It is either CACHED, or
 		 * (we assume) about to be read or filled.
 		 */
 		return;
 	}
 
 	/*
 	 * Find the closest ancestor (indirect block) of the target block
 	 * that is present in the cache.  In this indirect block, we will
 	 * find the bp that is at curlevel, curblkid.
 	 */
 	curlevel = level;
 	curblkid = blkid;
 	while (curlevel < nlevels - 1) {
 		int parent_level = curlevel + 1;
 		uint64_t parent_blkid = curblkid >> epbs;
 		/* named distinctly so it does not shadow `db` above */
 		dmu_buf_impl_t *parent_db;
 
 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
 		    FALSE, TRUE, FTAG, &parent_db) == 0) {
 			/* copy the bp out so the hold can be dropped */
 			blkptr_t *bpp = parent_db->db_buf->b_data;
 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
 			dbuf_rele(parent_db, FTAG);
 			break;
 		}
 
 		curlevel = parent_level;
 		curblkid = parent_blkid;
 	}
 
 	if (curlevel == nlevels - 1) {
 		/* No cached indirect blocks found. */
 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
 		bp = dn->dn_phys->dn_blkptr[curblkid];
 	}
 	if (BP_IS_HOLE(&bp))
 		return;
 
 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
 	/* root zio that parents every read issued for this prefetch */
 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 	    dn->dn_object, level, blkid);
 	dpa->dpa_curlevel = curlevel;
 	dpa->dpa_prio = prio;
 	dpa->dpa_aflags = aflags;
 	dpa->dpa_spa = dn->dn_objset->os_spa;
 	dpa->dpa_epbs = epbs;
 	dpa->dpa_zio = pio;
 
 	/*
 	 * If we have the indirect just above us, no need to do the asynchronous
 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
 	 * a higher level, though, we want to issue the prefetches for all the
 	 * indirect blocks asynchronously, so we can go on with whatever we were
 	 * doing.
 	 */
 	if (curlevel == level) {
 		ASSERT3U(curblkid, ==, blkid);
 		dbuf_issue_final_prefetch(dpa, &bp);
 		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 		    dn->dn_object, curlevel, curblkid);
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 	/*
 	 * We use pio here instead of dpa_zio since it's possible that
 	 * dpa may have already been freed.
 	 */
 	zio_nowait(pio);
 }
 
 /*
  * Look up the dbuf for block `blkid' at `level' of `dn' and, unless
  * fail_uncached is set, create it if it is not already in the dbuf hash.
  * On success a hold is taken for `tag', the dbuf is stored in *dbp and 0 is
  * returned.
  *
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  *
  * On failure *dbp is left NULL and an errno is returned:
  *   - ENOENT if fail_uncached is set and the dbuf either does not exist
  *     yet or is not in the DB_CACHED state;
  *   - ENOENT if fail_sparse is set and dbuf_findbp() located a hole;
  *   - any error from dbuf_findbp() if fail_sparse is set, or any error
  *     other than ENOENT if it is not.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
 		/* The caller only wants dbufs that already exist. */
 		if (fail_uncached)
 			return (SET_ERROR(ENOENT));
 
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			/* A hole counts as "not there" for sparse lookups. */
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		/* Without fail_sparse, ENOENT from dbuf_findbp() is tolerated. */
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
 	/* db_mtx is held here, so it must be dropped before bailing out. */
 	if (fail_uncached && db->db_state != DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
 		arc_buf_add_ref(db->db_buf, db);
 		if (db->db_buf->b_data == NULL) {
 			/*
 			 * The ARC buffer no longer has data (presumably it
 			 * was evicted underneath us); discard this dbuf,
 			 * drop any parent hold and start over.
 			 */
 			dbuf_clear(db);
 			if (parent) {
 				dbuf_rele(parent, NULL);
 				parent = NULL;
 			}
 			goto top;
 		}
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 	}
 
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
 
 		if (dr->dt.dl.dr_data == db->db_buf) {
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			dbuf_set_data(db,
 			    arc_buf_alloc(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
 		}
 	}
 
 	(void) refcount_add(&db->db_holds, tag);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
 
 	return (0);
 }
 
 /*
  * Hold the level-0 dbuf for block `blkid' of `dn' on behalf of `tag'.
  * The result is whatever dbuf_hold_level() returns for level 0.
  */
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *leaf;
 
 	leaf = dbuf_hold_level(dn, 0, blkid, tag);
 	return (leaf);
 }
 
 /*
  * Hold the dbuf for block `blkid' at `level' of `dn' on behalf of `tag'.
  * Neither fail_sparse nor fail_uncached is requested from dbuf_hold_impl().
  * Returns the held dbuf, or NULL if dbuf_hold_impl() reported an error.
  */
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *held;
 
 	if (dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &held) != 0)
 		return (NULL);
 
 	return (held);
 }
 
 /*
  * Create the bonus dbuf for `dn' and record it in dn->dn_bonus.
  * The caller must hold dn_struct_rwlock as writer and the dnode must not
  * already have a bonus dbuf.
  */
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	dmu_buf_impl_t *bonus;
 
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_bonus == NULL);
 
 	bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
 	dn->dn_bonus = bonus;
 }
 
 /*
  * Resize a spill block dbuf to `blksz' bytes in transaction `tx'.
  *
  * A size of 0 selects SPA_MINBLOCKSIZE; any other size is rounded up to a
  * multiple of SPA_MINBLOCKSIZE.  Returns ENOTSUP if `db_fake' is not the
  * spill block, otherwise 0.
  */
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *spill = (dmu_buf_impl_t *)db_fake;
 	dnode_t *owner;
 
 	if (spill->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
 
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
 	ASSERT3U(blksz, <=,
 	    spa_maxblocksize(dmu_objset_spa(spill->db_objset)));
 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	/* The resize is done with the dnode's struct lock held as writer. */
 	DB_DNODE_ENTER(spill);
 	owner = DB_DNODE(spill);
 	rw_enter(&owner->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(spill, blksz, tx);
 	rw_exit(&owner->dn_struct_rwlock);
 	DB_DNODE_EXIT(spill);
 
 	return (0);
 }
 
 /*
  * Free the spill block of `dn' in transaction `tx' by freeing the
  * single-block range [DMU_SPILL_BLKID, DMU_SPILL_BLKID].
  */
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	const uint64_t spill_blkid = DMU_SPILL_BLKID;
 
 	dbuf_free_range(dn, spill_blkid, spill_blkid, tx);
 }
 
 /*
  * Add a hold on `db' for `tag'.  The caller is expected to already have a
  * hold: the resulting hold count is asserted to be greater than one.
  * Also exported under the weak alias dmu_buf_add_ref.
  */
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t nholds;
 
 	/* Keep the add outside ASSERT() so it survives non-debug builds. */
 	nholds = refcount_add(&db->db_holds, tag);
 	ASSERT(nholds > 1);
 }
 
 /*
  * Try to add a hold on `db_fake' for `tag', but only if it is still the
  * dbuf registered for (os, obj, blkid) and it has more holds than dirty
  * records (db_dirtycnt).
  *
  * Returns B_TRUE if the hold was added, B_FALSE otherwise.
  * Also exported under the weak alias dmu_buf_try_add_ref.
  */
 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
 boolean_t
 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
     void *tag)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_impl_t *found_db;
 	boolean_t result = B_FALSE;
 
 	if (db->db_blkid == DMU_BONUS_BLKID)
 		found_db = dbuf_find_bonus(os, obj);
 	else
 		found_db = dbuf_find(os, obj, 0, blkid);
 
 	if (found_db != NULL) {
 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
 			(void) refcount_add(&db->db_holds, tag);
 			result = B_TRUE;
 		}
 		/*
 		 * The lookup returned with found_db's db_mtx held
 		 * (dbuf_find_bonus() is assumed to behave like dbuf_find()
 		 * here), so that is the lock to drop.  Unlocking db->db_mtx
 		 * is only equivalent when db == found_db; otherwise it would
 		 * release a mutex we do not own and leave found_db locked.
 		 */
 		mutex_exit(&found_db->db_mtx);
 	}
 	return (result);
 }
 
 /*
  * Release the hold that `tag' has on `db'.  This takes db_mtx and hands
  * off to dbuf_rele_and_unlock(); db_mtx is not released in this function.
  *
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag);
 }
 
 /*
  * Public-handle variant of dbuf_rele(): release the hold that `tag' has on
  * the dbuf behind `db'.
  */
 void
 dmu_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
 
 	dbuf_rele(impl, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
  *
  * Must be entered with db_mtx held.  The branches below that end in
  * mutex_exit() drop it directly; the branches that end in dbuf_evict() or
  * dbuf_clear() do not call mutex_exit() themselves, so those routines are
  * presumably responsible for dropping or destroying the lock (confirm
  * against their definitions).
  *
  * When the last hold goes away the dbuf is handled according to its kind:
  * a bonus buffer drops its dnode hold, a dbuf without ARC data or with
  * released (anonymous) data is evicted, and a cached dbuf is either cleared
  * or left in place depending on cacheability and pending-evict state.
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
 		arc_buf_freeze(db->db_buf);
 
 	/*
 	 * For level-0 dbufs flagged for immediate user eviction, evict the
 	 * user as soon as only the dirty records' holds remain.
 	 */
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			dnode_t *dn;
 			/* Sampled now: db must not be touched after unlock. */
 			boolean_t evict_dbuf = db->db_pending_evict;
 
 			/*
 			 * If the dnode moves here, we cannot cross this
 			 * barrier until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 
 			dn = DB_DNODE(db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 
 			/*
 			 * Decrementing the dbuf count means that the bonus
 			 * buffer's dnode hold is no longer discounted in
 			 * dnode_move(). The dnode cannot move until after
 			 * the dnode_rele() below.
 			 */
 			DB_DNODE_EXIT(db);
 
 			/*
 			 * Do not reference db after its lock is dropped.
 			 * Another thread may evict it.
 			 */
 			mutex_exit(&db->db_mtx);
 
 			if (evict_dbuf)
 				dnode_evict_bonus(dn);
 
 			/* db is used below only as the hold tag value. */
 			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_evict(db);
 		} else if (arc_released(db->db_buf)) {
 			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_clear_data(db);
 			VERIFY(arc_buf_remove_ref(buf, db));
 			dbuf_evict(db);
 		} else {
 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 
 			/*
 			 * A dbuf will be eligible for eviction if either the
 			 * 'primarycache' property is set or a duplicate
 			 * copy of this buffer is already cached in the arc.
 			 *
 			 * In the case of the 'primarycache' a buffer
 			 * is considered for eviction if it matches the
 			 * criteria set in the property.
 			 *
 			 * To decide if our buffer is considered a
 			 * duplicate, we must call into the arc to determine
 			 * if multiple buffers are referencing the same
 			 * block on-disk. If so, then we simply evict
 			 * ourselves.
 			 */
 			if (!DBUF_IS_CACHEABLE(db)) {
 				if (db->db_blkptr != NULL &&
 				    !BP_IS_HOLE(db->db_blkptr) &&
 				    !BP_IS_EMBEDDED(db->db_blkptr)) {
 					spa_t *spa =
 					    dmu_objset_spa(db->db_objset);
 					/*
 					 * Copy the bp before dbuf_clear();
 					 * it is used after the clear.
 					 */
 					blkptr_t bp = *db->db_blkptr;
 					dbuf_clear(db);
 					arc_freed(spa, &bp);
 				} else {
 					dbuf_clear(db);
 				}
 			} else if (db->db_pending_evict ||
 			    arc_buf_eviction_needed(db->db_buf)) {
 				dbuf_clear(db);
 			} else {
 				mutex_exit(&db->db_mtx);
 			}
 		}
 	} else {
 		/* Other holds remain: just drop the lock. */
 		mutex_exit(&db->db_mtx);
 	}
 }
 
 /*
  * Return the current number of holds on `db'.
  * Also exported under the weak alias dmu_buf_refcount.
  */
 #pragma weak dmu_buf_refcount = dbuf_refcount
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	uint64_t nholds = refcount_count(&db->db_holds);
 
 	return (nholds);
 }
 
 /*
  * Compare-and-swap the user attached to a dbuf, under db_mtx.
  *
  * If the dbuf's current user is `old_user' it is replaced with `new_user'.
  * In every case the user that was attached on entry is returned, so the
  * swap succeeded exactly when the return value equals `old_user'.
  */
 void *
 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
     dmu_buf_user_t *new_user)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_user_t *prev;
 
 	mutex_enter(&impl->db_mtx);
 	dbuf_verify_user(impl, DBVU_NOT_EVICTING);
 	prev = impl->db_user;
 	if (prev == old_user)
 		impl->db_user = new_user;
 	dbuf_verify_user(impl, DBVU_NOT_EVICTING);
 	mutex_exit(&impl->db_mtx);
 
 	return (prev);
 }
 
 /*
  * Attach `user' to the dbuf if it currently has no user (the expected old
  * user is NULL).  Returns what dmu_buf_replace_user() returns.
  */
 void *
 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	void *prev;
 
 	prev = dmu_buf_replace_user(db_fake, NULL, user);
 	return (prev);
 }
 
 /*
  * Like dmu_buf_set_user(), but first flags the dbuf for immediate user
  * eviction (db_user_immediate_evict).  The flag is set before, and
  * regardless of the outcome of, the dmu_buf_set_user() call.
  */
 void *
 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 	void *prev;
 
 	impl->db_user_immediate_evict = TRUE;
 	prev = dmu_buf_set_user(db_fake, user);
 
 	return (prev);
 }
 
 /*
  * Detach `user' from the dbuf if it is the current user (the replacement
  * is NULL).  Returns what dmu_buf_replace_user() returns.
  */
 void *
 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	void *prev;
 
 	prev = dmu_buf_replace_user(db_fake, user, NULL);
 	return (prev);
 }
 
 /*
  * Return the user currently attached to the dbuf.  db_mtx is not taken
  * here.
  */
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 
 	dbuf_verify_user(impl, DBVU_NOT_EVICTING);
 
 	return (impl->db_user);
 }
 
 /*
  * Block until all work queued on dbu_evict_taskq has completed.
  *
  * Declared with an explicit (void) parameter list so the definition is a
  * proper prototype rather than an old-style unprototyped declarator.
  */
 void
 dmu_buf_user_evict_wait(void)
 {
 	taskq_wait(dbu_evict_taskq);
 }
 
 /*
  * Report whether the block backing `dbuf' is freeable according to
  * dsl_dataset_block_freeable().  A dbuf with no block pointer is reported
  * as not freeable.
  */
 boolean_t
 dmu_buf_freeable(dmu_buf_t *dbuf)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)dbuf;
 
 	if (!impl->db_blkptr)
 		return (B_FALSE);
 
 	return (dsl_dataset_block_freeable(impl->db_objset->os_dsl_dataset,
 	    impl->db_blkptr, impl->db_blkptr->blk_birth));
 }
 
 /*
  * Return the dbuf's block pointer (db_blkptr), which may be NULL.
  */
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	return (((dmu_buf_impl_t *)db)->db_blkptr);
 }
 
 /*
  * Make sure db->db_blkptr is set, hooking the dbuf up to the place its
  * block pointer lives if that has not happened yet:
  *   - the spill block points at dn_phys->dn_spill (which is zeroed here);
  *   - a top-level block points into dn_phys->dn_blkptr[];
  *   - any other block points into its parent indirect block's data, holding
  *     the parent first if db_parent is not set.
  *
  * Called and returns with db_mtx held, but the lock is dropped and
  * re-acquired on the path that has to hold the parent.
  */
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/* Would ASSERT(dmu_tx_is_syncing(tx)), but no tx is passed in. */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = &dn->dn_phys->dn_spill;
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mis-match).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		/* log2 of the number of block pointers per indirect block */
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			/*
 			 * db_mtx is dropped around the parent hold, which is
 			 * taken under dn_struct_rwlock with db as the tag.
 			 *
 			 * NOTE(review): dbuf_hold_level() returns NULL on
 			 * failure and parent is dereferenced below without a
 			 * check; presumably the hold cannot fail here --
 			 * confirm.
 			 */
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		/* Index of this block's pointer within the parent block. */
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 /*
  * Sync one dirty indirect block (db_level > 0) in syncing context.
  *
  * The block is read in if it has no buffer yet, its block pointer is hooked
  * up, the dirty record is published as db_data_pending, and the write zio
  * is created by dbuf_write().  The record's dirty children are then synced
  * one level down before this block's own zio is issued with zio_nowait().
  */
 static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		/* db_mtx is not held across the read. */
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 	dbuf_write(dr, db->db_buf, tx);
 
 	/* dbuf_write() set dr->dr_zio; issue it only after the children. */
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
 }
 
 /*
  * Sync one dirty level-0 dbuf (data, spill or bonus) in syncing context.
  *
  * A bonus buffer is not written by itself: its contents are copied into the
  * dnode's bonus area, the dirty record is unlinked and freed, and the
  * syncing hold is dropped.  For every other buffer the block pointer is
  * hooked up, any in-flight dmu_sync() of the record is waited for, a
  * private copy of the data is made if the buffer is still in use, and the
  * write zio is created by dbuf_write().  The zio is issued immediately,
  * except for meta-dnode blocks, whose dirty record is instead appended to
  * the dnode's dn_dirty_records list for this txg.
  */
 static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we
 	 * might have been freed after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* Record in the on-disk dnode that it has a spill block pointer. */
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
 
 		ASSERT(*datap != NULL);
 		ASSERT0(db->db_level);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
 		DB_DNODE_EXIT(db);
 
 		/* Free the record's copy if it is not the live db_data. */
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db_data_pending = NULL;
 		/* Unlink dr from the dbuf's list of dirty records. */
 		drp = &db->db_last_dirty;
 		while (*drp != dr)
 			drp = &(*drp)->dr_next;
 		ASSERT(dr->dr_next == NULL);
 		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
 		if (dr->dr_dbuf->db_level != 0) {
 			list_destroy(&dr->dt.di.dr_children);
 			mutex_destroy(&dr->dt.di.dr_mtx);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		/* The hold being dropped is tagged with the txg number. */
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * dbuf_check_blkptr() may drop the db_mtx lock, allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * blocks of the meta-dnode object, which the condition
 		 * above excludes).
 		 */
 		int blksz = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		/* Queue the record instead of issuing its zio here. */
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
 		DB_DNODE_EXIT(db);
 	} else {
 		/*
 		 * Although zio_nowait() does not "wait for an IO", it does
 		 * initiate the IO. If this is an empty write it seems plausible
 		 * that the IO could actually be completed before the nowait
 		 * returns. We need to DB_DNODE_EXIT() first in case
 		 * zio_nowait() invalidates the dbuf.
 		 */
 		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 /*
  * Sync every dirty record on `list', removing each record from the list
  * before handing it to dbuf_sync_indirect() (db_level > 0) or
  * dbuf_sync_leaf() (db_level == 0).
  *
  * `level' is the level the records are expected to be at; it is verified
  * for every record except bonus and spill blocks.  Processing stops early
  * at the first record that already has a zio (see the comment below).
  */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	/* Explicit comparison: the assignment in the condition is intended. */
 	while ((dr = list_head(list)) != NULL) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
 		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
 		else
 			dbuf_sync_leaf(dr, tx);
 	}
 }
 
 /*
  * "Ready" callback for a dbuf write zio; `vdb' is the dbuf being written
  * and `buf' is unused.
  *
  * Charges the dnode for the change in on-disk space between the original
  * and the new block pointer (relative to what a previous invocation for the
  * same zio already charged), raises dn_maxblkid for level-0 blocks beyond
  * it, and computes the block pointer's fill count: the number of allocated
  * dnodes for a DMU_OT_DNODE block, 0 or 1 for other level-0 blocks, or the
  * sum of the children's fill counts for an indirect block.
  */
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	/* Only charge the difference from what was charged last time. */
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (bp->blk_birth != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			/* Count the allocated dnodes in this block. */
 			dnode_phys_t *dnp = db->db.db_data;
 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
 			    i--, dnp++) {
 				if (dnp->dn_type != DMU_OT_NONE)
 					fill++;
 			}
 		} else {
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		/* Sum the fill counts of all non-hole child pointers. */
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	/* blk_fill is not stored for embedded block pointers. */
 	if (!BP_IS_EMBEDDED(bp))
 		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Per-physical-i/o completion callback for a dbuf write.
  *
  * The SPA invokes this once for every physical child i/o of the zio
  * (zio->io_phys_children times in total), e.g. once per copy of an indirect
  * block or per fragment of a RAID-Z block.  Because those children may
  * finish far apart in time, each invocation retires an equal share of the
  * dirty space accounted to the pending dirty record, so that dirty space
  * shrinks gradually as the physical i/os complete.
  */
 /* ARGSUSED */
 static void
 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	dmu_buf_impl_t *db = arg;
 	objset_t *objset = db->db_objset;
 	dsl_pool_t *pool = dmu_objset_pool(objset);
 	dbuf_dirty_record_t *pending = db->db_data_pending;
 	int share;
 
 	ASSERT3U(pending->dr_txg, ==, zio->io_txg);
 
 	/*
 	 * One share per call; whatever the integer division leaves over is
 	 * cleaned up by dsl_pool_sync()'s call to dsl_pool_undirty_space().
 	 */
 	share = pending->dr_accounted / zio->io_phys_children;
 	dsl_pool_undirty_space(pool, share, zio->io_txg);
 }
 
 /*
  * "Done" callback for a dbuf write zio; `vdb' is the dbuf that was written
  * and `buf' is unused.
  *
  * Unless the write was a rewrite or nopwrite, the old block is killed and
  * the new one is born in the dataset's accounting.  The pending dirty
  * record is then unlinked from the dbuf and freed, waiters on db_changed
  * are woken, and the hold taken for this txg is dropped via
  * dbuf_rele_and_unlock(), which is entered with db_mtx held.
  */
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	/* Find the link that points at the pending record and unlink it. */
 	drp = &db->db_last_dirty;
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 		DB_DNODE_EXIT(db);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			/*
 			 * If the written data was a separate copy, drop our
 			 * reference to it; otherwise it is still the dbuf's
 			 * buffer and gets the eviction callback (unless it
 			 * has been released).
 			 */
 			if (dr->dt.dl.dr_data != db->db_buf)
 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
 				    db));
 			else if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 			if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 		DB_DNODE_EXIT(db);
 		/* Indirect dirty records own a mutex and a child list. */
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	/* The hold being dropped is tagged with the txg number. */
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 /*
  * zio "ready" adapter for NOFILL writes: zio->io_private is the dbuf, and
  * there is no ARC buffer to pass along.
  */
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, vdb);
 }
 
 /*
  * zio "done" adapter for NOFILL writes: zio->io_private is the dbuf, and
  * there is no ARC buffer to pass along.
  */
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_done(zio, NULL, vdb);
 }
 
 /*
  * zio "ready" adapter for overridden writes: zio->io_private is the dirty
  * record, from which the dbuf is taken and handed to dbuf_write_ready().
  */
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, rec->dr_dbuf);
 }
 
 /*
  * zio "done" adapter for overridden writes: zio->io_private is the dirty
  * record.
  *
  * If the block pointer that was actually written differs from the override
  * block pointer stored in the record, the override block is freed (unless
  * it is a hole) and the record's data buffer is released, all under db_mtx.
  * The common completion path in dbuf_write_done() then runs.
  */
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 	dmu_buf_impl_t *owner = rec->dr_dbuf;
 	blkptr_t *override_bp = &rec->dt.dl.dr_overridden_by;
 
 	mutex_enter(&owner->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, override_bp)) {
 		if (!BP_IS_HOLE(override_bp)) {
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg,
 			    override_bp);
 		}
 		arc_release(rec->dt.dl.dr_data, owner);
 	}
 	mutex_exit(&owner->db_mtx);
 
 	dbuf_write_done(zio, NULL, owner);
 }
 
 /*
  * Issue I/O to commit a dirty buffer to disk.
  *
  * Creates the write zio for dirty record `dr' and stores it in dr->dr_zio;
  * the zio is not started here.  `data' is the ARC buffer to write (it is
  * checked for NULL only on the override path).  The new zio is made a child
  * of the parent indirect block's pending write zio, or of dn->dn_zio when
  * the parent is the dnode itself.
  *
  * One of three kinds of zio is created:
  *   - an overridden write, when open context already supplied the block
  *     pointer (dr_override_state == DR_OVERRIDDEN);
  *   - a NOFILL write carrying no data;
  *   - a regular arc_write() of `data'.
  */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int wp_flag = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
 			 * in the syncing context and we don't want the
 			 * overhead of making multiple copies of the data.
 			 */
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
 				dbuf_release_bp(db);
 			}
 		}
 	}
 
 	/* Pick the parent zio that the new write will hang off. */
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		zio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
 	ASSERT(zio);
 
 	/* Objsets without a dataset are bookmarked as DMU_META_OBJSET. */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
 		void *contents = (data != NULL) ? data->b_data : NULL;
 
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, contents, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		/* The override state is reset and consumed under db_mtx. */
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
 		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 296519)
@@ -1,1987 +1,1988 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dnode.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/vdev.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  * before it can be safely accessed.  dmu_objset_evict_done() briefly takes
  * this lock as a reader, as a barrier, before it frees an objset.
  */
 krwlock_t os_lock;
 
 /*
  * Tunable to override the maximum number of threads for the parallelization
  * of dmu_objset_find_dp, needed to speed up the import of pools with many
  * datasets.
  * Default is 4 times the number of leaf vdevs.
  */
 int dmu_find_threads = 0;
 
 static void dmu_objset_find_dp_cb(void *arg);
 
 /*
  * Initialize the file-global os_lock reader/writer lock.
  */
 void
 dmu_objset_init(void)
 {
 	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
 }
 
 /*
  * Destroy the file-global os_lock set up by dmu_objset_init().
  */
 void
 dmu_objset_fini(void)
 {
 	rw_destroy(&os_lock);
 }
 
 /*
  * Return the spa_t stored in the objset's os_spa field.
  */
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
 	spa_t *owning_spa = os->os_spa;
 
 	return (owning_spa);
 }
 
 /*
  * Return the zilog_t stored in the objset's os_zil field.
  */
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
 	zilog_t *log = os->os_zil;
 
 	return (log);
 }
 
 /*
  * Return the dsl_pool_t for this objset.  When the objset has a dataset
  * with a dsl_dir, the pool comes from that dir; otherwise it is looked up
  * through the objset's spa.
  */
 dsl_pool_t *
 dmu_objset_pool(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	/* No dataset, or a dataset with no dir: fall back to the spa. */
 	if (dataset == NULL || !dataset->ds_dir)
 		return (spa_get_dsl(os->os_spa));
 
 	return (dataset->ds_dir->dd_pool);
 }
 
 /*
  * Return the dataset pointer stored in the objset (os_dsl_dataset).
  */
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	return (dataset);
 }
 
 /*
  * Return the objset type recorded in the objset's physical header.
  */
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
 	objset_phys_t *phys = os->os_phys;
 
 	return (phys->os_type);
 }
 
 /*
  * Write the name of the objset's dataset into buf, as produced by
  * dsl_dataset_name().
  */
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	dsl_dataset_name(dataset, buf);
 }
 
 /*
  * Return the object number of the objset's dataset, or 0 when the objset
  * has no dataset.
  */
 uint64_t
 dmu_objset_id(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	if (!dataset)
 		return (0);
 
 	return (dataset->ds_object);
 }
 
 /*
  * Return the objset's cached sync setting (os_sync).
  */
 zfs_sync_type_t
 dmu_objset_syncprop(objset_t *os)
 {
 	zfs_sync_type_t setting = os->os_sync;
 
 	return (setting);
 }
 
 /*
  * Return the objset's cached logbias setting (os_logbias).
  */
 zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	zfs_logbias_op_t bias = os->os_logbias;
 
 	return (bias);
 }
 
 /*
  * Property callback for "checksum": cache the selected checksum function
  * in the objset passed as arg.
  */
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* An inherited value must already have been resolved. */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	objset->os_checksum =
 	    zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 }
 
 /*
  * Property callback for "compression": cache the selected compression
  * function in the objset passed as arg.
  */
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* Inheritance and range checks must already have happened. */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
 	objset->os_compress =
 	    zio_compress_select(objset->os_spa, newval, ZIO_COMPRESS_ON);
 }
 
 /*
  * Property callback for "copies": cache the new copy count in the objset
  * passed as arg.
  */
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os;
 
 	os = arg;
 
 	/* The value must already be resolved and within range. */
 	ASSERT(newval > 0);
 	ASSERT(newval <= spa_max_replication(os->os_spa));
 
 	os->os_copies = newval;
 }
 
 /*
  * Property callback for "dedup": split the selected dedup checksum into
  * the checksum function and the verify flag, and cache both in the objset
  * passed as arg.
  */
 static void
 dedup_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 	enum zio_checksum selected;
 
 	/* An inherited value must already have been resolved. */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	selected = zio_checksum_dedup_select(objset->os_spa, newval,
 	    ZIO_CHECKSUM_OFF);
 
 	objset->os_dedup_checksum = selected & ZIO_CHECKSUM_MASK;
 	objset->os_dedup_verify = (selected & ZIO_CHECKSUM_VERIFY) != 0;
 }
 
 /*
  * Property callback for "primarycache": cache the new value in the objset
  * passed as arg.
  */
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* Only the three known cache settings are expected here. */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	objset->os_primary_cache = newval;
 }
 
 /*
  * Property callback for "secondarycache": cache the new value in the
  * objset passed as arg.
  */
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* Only the three known cache settings are expected here. */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	objset->os_secondary_cache = newval;
 }
 
 /*
  * Property callback for "sync": cache the new value in the objset passed
  * as arg and, when the objset has a ZIL, forward it to the ZIL as well.
  */
 static void
 sync_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* Only the three known sync settings are expected here. */
 	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 	    newval == ZFS_SYNC_DISABLED);
 
 	objset->os_sync = newval;
 
 	if (objset->os_zil != NULL)
 		zil_set_sync(objset->os_zil, newval);
 }
 
 /*
  * Property callback for "redundant_metadata": cache the new value in the
  * objset passed as arg.
  */
 static void
 redundant_metadata_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/* Only the two known settings are expected here. */
 	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 	    newval == ZFS_REDUNDANT_METADATA_MOST);
 
 	objset->os_redundant_metadata = newval;
 }
 
 /*
  * Property callback for "logbias": cache the new value in the objset
  * passed as arg and, when the objset has a ZIL, forward it to the ZIL.
  */
 static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 	    newval == ZFS_LOGBIAS_THROUGHPUT);
 
 	objset->os_logbias = newval;
 
 	if (objset->os_zil != NULL)
 		zil_set_logbias(objset->os_zil, newval);
 }
 
 /*
  * Property callback for "recordsize": cache the new value in the objset
  * passed as arg.
  */
 static void
 recordsize_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	objset->os_recordsize = newval;
 }
 
 /*
  * Byteswap an objset_phys_t in place.  buf must be either
  * OBJSET_OLD_PHYS_SIZE or sizeof (objset_phys_t) bytes long; the
  * user/group accounting dnodes are only swapped for the larger size.
  */
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *phys = buf;
 
 	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 
 	/* These fields are swapped for either accepted size. */
 	dnode_byteswap(&phys->os_meta_dnode);
 	byteswap_uint64_array(&phys->os_zil_header, sizeof (zil_header_t));
 	phys->os_type = BSWAP_64(phys->os_type);
 	phys->os_flags = BSWAP_64(phys->os_flags);
 
 	if (size != sizeof (objset_phys_t))
 		return;
 
 	dnode_byteswap(&phys->os_userused_dnode);
 	dnode_byteswap(&phys->os_groupused_dnode);
 }
 
 /*
  * Build an in-core objset_t for the objset rooted at bp.
  *
  * spa - pool the objset belongs to
  * ds  - dataset the objset belongs to, or NULL for the meta-objset; when
  *       non-NULL the caller must hold ds->ds_opening_lock
  * bp  - root block pointer; a hole means a fresh, zeroed objset_phys_t is
  *       allocated instead of being read
  * osp - receives the new objset on success
  *
  * Returns 0 on success.  On failure the partially built objset is freed
  * and the error from arc_read() (with ECKSUM mapped to EIO) or from
  * dsl_prop_register() is returned.
  */
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     objset_t **osp)
 {
 	objset_t *os;
 	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		/* The objset exists on disk: read its objset_phys_t. */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		zbookmark_phys_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_FLAG_L2CACHE;
 		if (DMU_OS_IS_L2COMPRESSIBLE(os))
 			aflags |= ARC_FLAG_L2COMPRESS;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
 			/*
 			 * Copy the smaller on-disk header into a zeroed
 			 * full-size buffer and drop the original buffer.
 			 */
 			arc_buf_t *buf = arc_buf_alloc(spa,
 			    sizeof (objset_phys_t), &os->os_phys_buf,
 			    ARC_BUFC_METADATA);
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
 			(void) arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
 		os->os_phys = os->os_phys_buf->b_data;
 		os->os_flags = os->os_phys->os_flags;
 	} else {
 		/* Nothing on disk yet: start from a zeroed header. */
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 		os->os_phys_buf = arc_buf_alloc(spa, size,
 		    &os->os_phys_buf, ARC_BUFC_METADATA);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
 
 	/*
 	 * Note: the changed_cb will be called once before the register
 	 * func returns, thus changing the checksum/compression from the
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
 	if (ds != NULL) {
 		boolean_t needlock = B_FALSE;
 
 		/*
 		 * Note: it's valid to open the objset if the dataset is
 		 * long-held, in which case the pool_config lock will not
 		 * be held.
 		 */
 		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
 			needlock = B_TRUE;
 			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 		}
 		/*
 		 * Register the callbacks one at a time; once one fails,
 		 * the remaining registrations are skipped.
 		 */
 		err = dsl_prop_register(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
 		if (err == 0) {
 			err = dsl_prop_register(ds,
 			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
 		}
 		if (!ds->ds_is_snapshot) {
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    recordsize_changed_cb, os);
 			}
 		}
 		if (needlock)
 			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		if (err != 0) {
 			/* Registration failed: release the buffer and objset. */
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf));
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
 	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_ON;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 		os->os_dedup_verify = B_FALSE;
 		os->os_logbias = ZFS_LOGBIAS_LATENCY;
 		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
 
 	/* Snapshots keep the zeroed in-core ZIL header from kmem_zalloc(). */
 	if (ds == NULL || !ds->ds_is_snapshot)
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	/* Per-txg dirty and free dnode lists share dn_dirty_link[i]. */
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 	}
 	list_create(&os->os_dnodes, sizeof (dnode_t),
 	    offsetof(dnode_t, dn_link));
 	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * The user/group accounting dnodes are only opened when the
 	 * buffer is large enough to hold a full objset_phys_t.
 	 */
 	dnode_special_open(os, &os->os_phys->os_meta_dnode,
 	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 		dnode_special_open(os, &os->os_phys->os_userused_dnode,
 		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
 		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
 		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 	}
 
 	*osp = os;
 	return (0);
 }
 
 /*
  * Return, via osp, the objset attached to ds, opening and attaching it
  * first if ds->ds_objset is still NULL.  Returns 0 or the error from
  * dmu_objset_open_impl(); on error *osp is set to the (still NULL)
  * ds->ds_objset.
  */
 int
 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
 	int error = 0;
 
 	/*
 	 * Datasets may only be touched with the pool_config lock held or
 	 * while the dataset is long-held.
 	 */
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
 	    dsl_dataset_long_held(ds));
 
 	mutex_enter(&ds->ds_opening_lock);
 	if (ds->ds_objset == NULL) {
 		objset_t *opened;
 
 		error = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 		    ds, dsl_dataset_get_blkptr(ds), &opened);
 		if (error == 0) {
 			/* Publish the new objset under ds_lock. */
 			mutex_enter(&ds->ds_lock);
 			ASSERT(ds->ds_objset == NULL);
 			ds->ds_objset = opened;
 			mutex_exit(&ds->ds_lock);
 		}
 	}
 	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 
 	return (error);
 }
 
 /*
  * Hold the pool and the named dataset with 'tag' and return the
  * dataset's objset via osp.  The pool stays held for as long as the
  * objset is held, so only one objset can be held at a time.  On failure
  * every hold taken here is dropped before the error is returned.
  */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 	int error;
 
 	error = dsl_pool_hold(name, tag, &pool);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(pool, name, tag, &dataset);
 	if (error == 0) {
 		error = dmu_objset_from_ds(dataset, osp);
 		if (error != 0)
 			dsl_dataset_rele(dataset, tag);
 	}
 
 	/* Any failure after the pool hold releases it again. */
 	if (error != 0)
 		dsl_pool_rele(pool, tag);
 
 	return (error);
 }
 
 /*
  * Finish taking ownership of an already-owned dataset: fetch its objset
  * into *osp and validate the request.  On any failure the dataset is
  * disowned.  Returns EINVAL when 'type' is not DMU_OST_ANY and does not
  * match the objset's type, EROFS for a writable request on a snapshot,
  * the error from dmu_objset_from_ds(), or 0.
  */
 static int
 dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	int error = dmu_objset_from_ds(ds, osp);
 
 	if (error != 0) {
 		dsl_dataset_disown(ds, tag);
 		return (error);
 	}
 
 	if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (!readonly && dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EROFS));
 	}
 
 	return (0);
 }
 
 /*
  * Take ownership of the named dataset with 'tag' and return its objset.
  *
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
  * and the dsl_pool will not be held.
  */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 	int error;
 
 	error = dsl_pool_hold(name, FTAG, &pool);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_own(pool, name, tag, &dataset);
 	if (error == 0)
 		error = dmu_objset_own_impl(dataset, type, readonly, tag, osp);
 
 	/* The pool hold is only needed while ownership is being set up. */
 	dsl_pool_rele(pool, FTAG);
 
 	return (error);
 }
 
 /*
  * Like dmu_objset_own(), but the dataset is identified by its object
  * number within the pool 'dp' rather than by name.
  */
 int
 dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_dataset_t *dataset;
 	int error;
 
 	error = dsl_dataset_own_obj(dp, obj, tag, &dataset);
 	if (error == 0)
 		error = dmu_objset_own_impl(dataset, type, readonly, tag, osp);
 
 	return (error);
 }
 
 /*
  * Release the dataset hold and then the pool hold that were taken with
  * 'tag'.  The pool is looked up before the dataset is released.
  */
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 
 	pool = dmu_objset_pool(os);
 	dataset = os->os_dsl_dataset;
 
 	dsl_dataset_rele(dataset, tag);
 	dsl_pool_rele(pool, tag);
 }
 
 /*
  * When we are called, os MUST refer to an objset associated with a dataset
  * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
  * == tag.  We will then release and reacquire ownership of the dataset while
  * holding the pool config_rwlock so that no intervening namespace or
  * ownership changes can occur.
  *
  * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
  * release the hold on its dataset and acquire a new one on the dataset of the
  * same name so that it can be partially torn down and reconstructed.
  */
 void
 dmu_objset_refresh_ownership(objset_t *os, void *tag)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds, *newds;
 	char name[MAXNAMELEN];
 
 	ds = os->os_dsl_dataset;
 	VERIFY3P(ds, !=, NULL);
 	VERIFY3P(ds->ds_owner, ==, tag);
 	VERIFY(dsl_dataset_long_held(ds));
 
 	/* Capture the name and pool before ownership is dropped. */
 	dsl_dataset_name(ds, name);
 	dp = dmu_objset_pool(os);
 	dsl_pool_config_enter(dp, FTAG);
 	dmu_objset_disown(os, tag);
 	/* Re-own by name; it must resolve to the same dataset. */
 	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
 	VERIFY3P(newds, ==, os->os_dsl_dataset);
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Give up the ownership of the objset's dataset that is held by 'tag'.
  */
 void
 dmu_objset_disown(objset_t *os, void *tag)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	dsl_dataset_disown(dataset, tag);
 }
 
 /*
  * Evict the dbufs of every dnode on os->os_dnodes that currently has a
  * hold, and then those of the special dnodes (group/user accounting,
  * when present, and the meta dnode).
  */
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	/* Stack-allocated placeholder used to keep our list position. */
 	dnode_t dn_marker;
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
 	dn = list_head(&os->os_dnodes);
 	while (dn != NULL) {
 		/*
 		 * Skip dnodes without holds.  We have to do this dance
 		 * because dnode_add_ref() only works if there is already a
 		 * hold.  If the dnode has no holds, then it has no dbufs.
 		 */
 		if (dnode_add_ref(dn, FTAG)) {
 			/*
 			 * Leave the marker right after dn so the walk can
 			 * resume from it once os_lock is retaken.
 			 */
 			list_insert_after(&os->os_dnodes, dn, &dn_marker);
 			mutex_exit(&os->os_lock);
 
 			dnode_evict_dbufs(dn);
 			dnode_rele(dn, FTAG);
 
 			mutex_enter(&os->os_lock);
 			dn = list_next(&os->os_dnodes, &dn_marker);
 			list_remove(&os->os_dnodes, &dn_marker);
 		} else {
 			dn = list_next(&os->os_dnodes, dn);
 		}
 	}
 	mutex_exit(&os->os_lock);
 
 	/* The special dnodes are handled separately, after the list walk. */
 	if (DMU_USERUSED_DNODE(os) != NULL) {
 		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
 		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 	}
 	dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
 /*
  * Objset eviction processing is split into two pieces.
  * The first marks the objset as evicting, evicts any dbufs that
  * have a refcount of zero, and then queues up the objset for the
  * second phase of eviction.  Once os->os_dnodes has been cleared by
  * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
  * The second phase closes the special dnodes, dequeues the objset from
  * the list of those undergoing eviction, and finally frees the objset.
  *
  * NOTE: Due to asynchronous eviction processing (invocation of
  *       dnode_buf_pageout()), it is possible for the meta dnode for the
  *       objset to have no holds even though os->os_dnodes is not empty.
  */
 void
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	int no_dnodes_left;
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
 	if (ds != NULL)
 		dsl_prop_unregister_all(ds, os);
 
 	if (os->os_sa != NULL)
 		sa_tear_down(os);
 
 	dmu_objset_evict_dbufs(os);
 
 	/*
 	 * Register as evicting and sample the dnode list under os_lock;
 	 * the second phase runs right away only if the list is empty.
 	 */
 	mutex_enter(&os->os_lock);
 	spa_evicting_os_register(os->os_spa, os);
 	no_dnodes_left = list_is_empty(&os->os_dnodes);
 	mutex_exit(&os->os_lock);
 
 	if (no_dnodes_left)
 		dmu_objset_evict_done(os);
 }
 
 /*
  * Second phase of objset eviction: close the special dnodes, free the
  * ZIL and the objset_phys buffer, destroy the objset's mutexes,
  * deregister it from the spa's evicting set and free it.  os->os_dnodes
  * must already be empty.
  */
 void
 dmu_objset_evict_done(objset_t *os)
 {
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	dnode_special_close(&os->os_meta_dnode);
 	if (DMU_USERUSED_DNODE(os)) {
 		dnode_special_close(&os->os_userused_dnode);
 		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
 	 * use. We consider the objset valid before the barrier and invalid
 	 * after the barrier.
 	 */
 	rw_enter(&os_lock, RW_READER);
 	rw_exit(&os_lock);
 
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
 	/* Deregister before the memory is released. */
 	spa_evicting_os_deregister(os->os_spa, os);
 	kmem_free(os, sizeof (objset_t));
 }
 
 /*
  * Return dsl_dir_snap_cmtime() for the dsl_dir of the objset's dataset.
  * The objset's dataset pointer is dereferenced unconditionally.
  */
 timestruc_t
 dmu_objset_snap_cmtime(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	return (dsl_dir_snap_cmtime(dataset->ds_dir));
 }
 
 /*
  * Called from dsl for the meta-objset (ds == NULL), and from
  * dmu_objset_create_sync() for regular datasets.
  *
  * Opens the objset (via the dataset when one is given, otherwise directly
  * from bp), allocates its meta dnode, records 'type' in the physical
  * header and dirties the dataset.  Must run in syncing context.  Returns
  * the objset; open failures are fatal (VERIFY0).
  */
 objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	objset_t *os;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (ds != NULL)
 		VERIFY0(dmu_objset_from_ds(ds, &os));
 	else
 		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 
 	/*
 	 * We don't want to have to increase the meta-dnode's nlevels
 	 * later, because then we could do it in quiescing context while
 	 * we are also accessing it in open context.
 	 *
 	 * This precaution is not necessary for the MOS (ds == NULL),
 	 * because the MOS is only updated in syncing context.
 	 * This is most fortunate: the MOS is the only objset that
 	 * needs to be synced multiple times as spa_sync() iterates
 	 * to convergence, so minimizing its dn_nlevels matters.
 	 */
 	if (ds != NULL) {
 		int levels = 1;
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
 		 * to contain DN_MAX_OBJECT dnodes.
 		 */
 		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 		    mdn->dn_nlevels = levels;
 	}
 
 	ASSERT(type != DMU_OST_NONE);
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	os->os_phys->os_type = type;
 	/* A new objset starts out with user accounting marked complete. */
 	if (dmu_objset_userused_enabled(os)) {
 		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 		os->os_flags = os->os_phys->os_flags;
 	}
 
 	dsl_dataset_dirty(ds, tx);
 
 	return (os);
 }
 
 /*
  * Arguments handed from dmu_objset_create() to its check and sync
  * functions.
  */
 typedef struct dmu_objset_create_arg {
 	/* Name of the dataset to create; must not contain '@'. */
 	const char *doca_name;
 	/* Credentials of the caller, captured with CRED(). */
 	cred_t *doca_cred;
 	/* Optional callback run from the sync function on the new objset. */
 	void (*doca_userfunc)(objset_t *os, void *arg,
 	    cred_t *cr, dmu_tx_t *tx);
 	/* Opaque argument passed through to doca_userfunc. */
 	void *doca_userarg;
 	/* Objset type passed to dmu_objset_create_impl(). */
 	dmu_objset_type_t doca_type;
 	/* Flags passed to dsl_dataset_create_sync(). */
 	uint64_t doca_flags;
 } dmu_objset_create_arg_t;
 
 /*
  * Check function for dmu_objset_create().  Fails with EINVAL when the
  * name contains '@', with EEXIST when the whole name already resolves to
  * a dsl_dir, and otherwise returns the result of the filesystem-limit
  * check on the parent dir.
  */
 /*ARGSUSED*/
 static int
 dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *args = arg;
 	dsl_dir_t *parent;
 	const char *remainder;
 	int err;
 
 	if (strchr(args->doca_name, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_dir_hold(dmu_tx_pool(tx), args->doca_name, FTAG,
 	    &parent, &remainder);
 	if (err != 0)
 		return (err);
 
 	/* No remaining component means the full name already exists. */
 	if (remainder == NULL) {
 		dsl_dir_rele(parent, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	err = dsl_fs_ss_limit_check(parent, 1, ZFS_PROP_FILESYSTEM_LIMIT,
 	    NULL, args->doca_cred);
 	dsl_dir_rele(parent, FTAG);
 
 	return (err);
 }
 
 /*
  * Sync function for dmu_objset_create(): create the dataset under its
  * parent dir, initialize its objset, run the optional user callback and
  * log the creation.  Hold failures are fatal (VERIFY0).
  */
 static void
 dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 	blkptr_t *bp;
 	objset_t *os;
 
 	/* pdd is the parent dir; tail is the not-yet-existing component. */
 	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
 	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	bp = dsl_dataset_get_blkptr(ds);
 	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 	    ds, bp, doca->doca_type, tx);
 
 	/* Let the caller populate the new objset within this tx. */
 	if (doca->doca_userfunc != NULL) {
 		doca->doca_userfunc(os, doca->doca_userarg,
 		    doca->doca_cred, tx);
 	}
 
 	spa_history_log_internal_ds(ds, "create", tx, "");
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 /*
  * Create a new objset of the given type named 'name' by running the
  * create check/sync pair as a sync task.  'func', when not NULL, is
  * called with 'arg' on the new objset from the sync function.  Returns
  * the result of dsl_sync_task().
  */
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
 	dmu_objset_create_arg_t doca = {
 		.doca_name = name,
 		.doca_cred = CRED(),
 		.doca_userfunc = func,
 		.doca_userarg = arg,
 		.doca_type = type,
 		.doca_flags = flags
 	};
 
 	return (dsl_sync_task(name,
 	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * Arguments handed from dmu_objset_clone() to its check and sync
  * functions.
  */
 typedef struct dmu_objset_clone_arg {
 	/* Name of the clone to create; must not contain '@'. */
 	const char *doca_clone;
 	/* Name of the origin dataset; the check requires a snapshot. */
 	const char *doca_origin;
 	/* Credentials of the caller, captured with CRED(). */
 	cred_t *doca_cred;
 } dmu_objset_clone_arg_t;
 
 /*
  * Check function for dmu_objset_clone().  Fails with EINVAL when the
  * clone name contains '@' or the origin is not a snapshot, with EEXIST
  * when the clone name already resolves to a dsl_dir, with EDQUOT when the
  * filesystem-limit check on the parent fails, or with the error from
  * looking up either name.
  */
 /*ARGSUSED*/
 static int
 dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *args = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_dir_t *parent;
 	dsl_dataset_t *origin_ds;
 	const char *remainder;
 	int origin_is_snapshot;
 	int err;
 
 	if (strchr(args->doca_clone, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_dir_hold(pool, args->doca_clone, FTAG, &parent, &remainder);
 	if (err != 0)
 		return (err);
 
 	/* No remaining component means the clone name already exists. */
 	if (remainder == NULL) {
 		dsl_dir_rele(parent, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	err = dsl_fs_ss_limit_check(parent, 1, ZFS_PROP_FILESYSTEM_LIMIT,
 	    NULL, args->doca_cred);
 	dsl_dir_rele(parent, FTAG);
 	if (err != 0)
 		return (SET_ERROR(EDQUOT));
 
 	err = dsl_dataset_hold(pool, args->doca_origin, FTAG, &origin_ds);
 	if (err != 0)
 		return (err);
 
 	/* You can only clone snapshots, not the head datasets. */
 	origin_is_snapshot = origin_ds->ds_is_snapshot;
 	dsl_dataset_rele(origin_ds, FTAG);
 	if (!origin_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Sync function for dmu_objset_clone(): create the clone dataset under
  * its parent dir with 'origin' as its origin, and log the operation with
  * the origin's name and object number.  Hold failures are fatal
  * (VERIFY0).
  */
 static void
 dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *origin, *ds;
 	uint64_t obj;
 	char namebuf[MAXNAMELEN];
 
 	/* pdd is the parent dir; tail is the not-yet-existing component. */
 	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
 	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	dsl_dataset_name(origin, namebuf);
 	/*
 	 * ds_object is a uint64_t; cast it so the argument really has the
 	 * type that the %llu conversion expects.
 	 */
 	spa_history_log_internal_ds(ds, "clone", tx,
 	    "origin=%s (%llu)", namebuf,
 	    (unsigned long long)origin->ds_object);
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dataset_rele(origin, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_clone(const char *clone, const char *origin)
 {
 	dmu_objset_clone_arg_t doca;
 
 	doca.doca_clone = clone;
 	doca.doca_origin = origin;
 	doca.doca_cred = CRED();
 
 	return (dsl_sync_task(clone,
 	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
 dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 {
 	int err;
 	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
 	nvlist_t *snaps = fnvlist_alloc();
 
 	fnvlist_add_boolean(snaps, longsnap);
 	strfree(longsnap);
 	err = dsl_dataset_snapshot(snaps, NULL, NULL);
 	fnvlist_free(snaps);
 	return (err);
 }
 
 /*
  * Drain 'list', syncing every dnode on it.  When 'newlist' is not NULL,
  * each dnode gets an extra reference tagged with newlist and is appended
  * to newlist before it is synced.
  */
 static void
 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	for (dn = list_head(list); dn != NULL; dn = list_head(list)) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 		/*
 		 * Initialize dn_zio outside dnode_sync() because the
 		 * meta-dnode needs to set it outside dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		list_remove(list, dn);
 
 		if (newlist != NULL) {
 			(void) dnode_add_ref(dn, newlist);
 			list_insert_tail(newlist, dn);
 		}
 
 		dnode_sync(dn, tx);
 	}
 }
 
 /*
  * "Ready" callback for the objset root block write: recompute the root
  * block pointer's fill count from the meta dnode's block pointers.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	objset_t *os = arg;
 	blkptr_t *bp = zio->io_bp;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 	int i;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT3P(bp, ==, os->os_rootbp);
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
 	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * The root bp's fill count is the number of objects allocated in
 	 * the object set.  The "special" objects stored in the
 	 * objset_phys_t itself (the meta dnode and the user/group
 	 * accounting objects) are not counted.
 	 */
 	bp->blk_fill = 0;
 	i = 0;
 	while (i < dnp->dn_nblkptr) {
 		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
 		i++;
 	}
 }
 
 /*
  * "Done" callback for the objset root block write.  Unless the write was
  * a rewrite in place, account for the old root block being killed and
  * the new one being born in the objset's dataset.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	objset_t *os = arg;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	dsl_dataset_t *dataset;
 	dmu_tx_t *synctx;
 
 	/* A rewrite leaves the block pointer unchanged: nothing to do. */
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 		return;
 	}
 
 	dataset = os->os_dsl_dataset;
 	synctx = os->os_synctx;
 
 	(void) dsl_dataset_block_kill(dataset, bp_orig, synctx, B_TRUE);
 	dsl_dataset_block_born(dataset, bp, synctx);
 }
 
 /*
  * Called from dsl.  Sync this objset for the txg of 'tx':
  *  - create the root block write as a child of 'pio';
  *  - sync the meta dnode and, when in use, the user/group accounting
  *    dnodes, with the root block write as their parent zio;
  *  - sync all freed and dirty dnodes of this txg, collecting them on
  *    os_synced_dnodes when user accounting is enabled;
  *  - issue the writes recorded in the meta dnode's dirty records;
  *  - sync the ZIL, copy its header into the physical objset, and issue
  *    the root block write.
  *
  * Must be called in syncing context.
  */
 void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
 
 	/* tx_txg is a uint64_t; cast it to match the %llu conversion. */
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n",
 	    (unsigned long long)tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
 	if (os->os_dsl_dataset == NULL) {
 		/*
 		 * This is the MOS.  If we have upgraded,
 		 * spa_max_replication() could change, so reset
 		 * os_copies here.
 		 */
 		os->os_copies = spa_max_replication(os->os_spa);
 	}
 
 	/*
 	 * Create the root block IO
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
 	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
 	    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	DMU_META_DNODE(os)->dn_zio = zio;
 	dnode_sync(DMU_META_DNODE(os), tx);
 
 	os->os_phys->os_flags = os->os_flags;
 
 	/* The accounting dnodes are only synced once they are allocated. */
 	if (DMU_USERUSED_DNODE(os) &&
 	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_USERUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_USERUSED_DNODE(os), tx);
 		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
 	if (dmu_objset_userused_enabled(os)) {
 		newlist = &os->os_synced_dnodes;
 		/*
 		 * We must create the list here because it uses the
 		 * dn_dirty_link[] of this txg.
 		 */
 		list_create(newlist, sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
 	/* Issue the writes queued on the meta dnode's level-0 records. */
 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while ((dr = list_head(list)) != NULL) {
 		ASSERT0(dr->dr_dbuf->db_level);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
 	}
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
 	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(zio);
 }
 
 /*
  * Return whether the objset has any dirty or freed dnodes queued for the
  * given txg.
  */
 boolean_t
 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 {
 	uint64_t slot = txg & TXG_MASK;
 
 	/* Clean only when both per-txg lists are empty. */
 	return (!(list_is_empty(&os->os_dirty_dnodes[slot]) &&
 	    list_is_empty(&os->os_free_dnodes[slot])));
 }
 
 /* Per-objset-type callbacks, indexed by dmu_objset_type_t. */
 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 /*
  * Record 'cb' as the callback for objsets of type 'ost'.  The index is
  * not range-checked here.
  */
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
 {
 	objset_used_cb_t **slot = &used_cbs[ost];
 
 	*slot = cb;
 }
 
 /*
  * Return whether user/group space accounting applies to this objset:
  * the pool version must support it, a callback must be registered for
  * the objset's type, and the objset must have a userused dnode.
  */
 boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	if (spa_version(os->os_spa) < SPA_VERSION_USERSPACE)
 		return (B_FALSE);
 
 	if (used_cbs[os->os_phys->os_type] == NULL)
 		return (B_FALSE);
 
 	return (DMU_USERUSED_DNODE(os) != NULL);
 }
 
 /*
  * Add (or, when 'subtract' is set, subtract) DNODE_SIZE + used to the
  * per-user and per-group usage totals.  Does nothing unless 'flags' has
  * DNODE_FLAG_USERUSED_ACCOUNTED set.  A failing zap update is fatal.
  */
 static void
 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
     uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
 {
 	int64_t delta;
 
 	if (!(flags & DNODE_FLAG_USERUSED_ACCOUNTED))
 		return;
 
 	delta = DNODE_SIZE + used;
 	delta = subtract ? -delta : delta;
 
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
 	    user, delta, tx));
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
 	    group, delta, tx));
 }
 
 /*
  * Drain os->os_synced_dnodes, applying each dnode's user/group usage
  * change: the old owner's usage is subtracted, the new owner's usage is
  * added, and the dnode's "new" ids become its "old" ids.  Each dnode is
  * then removed from the list and the reference tagged with the list is
  * dropped.
  */
 void
 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
 		/*
 		 * We intentionally modify the zap object even if the
 		 * net delta is zero.  Otherwise
 		 * the block of the zap obj could be shared between
 		 * datasets but need to be different between them after
 		 * a bprewrite.
 		 */
 
 		/* Snapshot the id flags before they are rewritten below. */
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
 			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
 			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
 			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
 			    dn->dn_newgid, B_FALSE, tx);
 		}
 
 		/* Roll the "new" ids over to "old" under dn_mtx. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_oldused = 0;
 		dn->dn_oldflags = 0;
 		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
 			dn->dn_olduid = dn->dn_newuid;
 			dn->dn_oldgid = dn->dn_newgid;
 			dn->dn_id_flags |= DN_ID_OLD_EXIST;
 			if (dn->dn_bonuslen == 0)
 				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 			else
 				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		}
 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		mutex_exit(&dn->dn_mtx);
 
 		/* Drop the reference tagged with this list. */
 		list_remove(list, dn);
 		dnode_rele(dn, list);
 	}
 }
 
 /*
  * Locate the buffer from which the uid/gid of an object can be read.
  *
  * When the dbuf is not dirty its current data is returned.  Otherwise
  * the dirty record belonging to the syncing txg is looked up; if no
  * such record exists NULL is returned, and the caller assumes that the
  * uid/gid are not changing.
  */
 static void *
 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *rec;
 	dnode_t *dn;
 	void *data;
 
 	/* Nothing is changing */
 	if (db->db_dirtycnt == 0)
 		return (db->db.db_data);
 
 	/* Walk the dirty records looking for the one of this txg. */
 	for (rec = db->db_last_dirty; rec != NULL; rec = rec->dr_next) {
 		if (rec->dr_txg == tx->tx_txg)
 			break;
 	}
 	if (rec == NULL)
 		return (NULL);
 
 	DB_DNODE_ENTER(rec->dr_dbuf);
 	dn = DB_DNODE(rec->dr_dbuf);
 
 	/*
 	 * A spill block of a dnode without bonus data keeps its data in
 	 * an arc buffer; otherwise dr_data itself is handed back.
 	 */
 	if (dn->dn_bonuslen == 0 &&
 	    rec->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
 		data = rec->dt.dl.dr_data->b_data;
 	else
 		data = rec->dt.dl.dr_data;
 
 	DB_DNODE_EXIT(rec->dr_dbuf);
 
 	return (data);
 }
 
 /*
  * Determine the uid/gid that a dnode's space should be charged to and
  * store them in the dnode, using the callback registered for the
  * objset's type.
  *
  * With "before" set the ids are stored in dn_olduid/dn_oldgid and
  * DN_ID_OLD_EXIST is set on success; otherwise they are stored in
  * dn_newuid/dn_newgid and DN_ID_NEW_EXIST is set on success.
  *
  * The data handed to the callback comes from the bonus buffer when the
  * dnode has bonus data, or from the spill block when it has none and its
  * bonus type is DMU_OT_SA.  In any other case only DN_ID_CHKED_BONUS is
  * set.  Does nothing when accounting is not enabled for the objset.
  */
 void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
 	uint64_t *user = NULL;
 	uint64_t *group = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;
 
 	if (!dmu_objset_userused_enabled(dn->dn_objset))
 		return;
 
 	/* The "before" ids have already been looked at; nothing to do. */
 	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
 	    DN_ID_CHKED_SPILL)))
 		return;
 
 	if (before && dn->dn_bonuslen != 0)
 		data = DN_BONUS(dn->dn_phys);
 	else if (!before && dn->dn_bonuslen != 0) {
 		if (dn->dn_bonus) {
 			/* db_mtx stays held until after the callback. */
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 			data = dmu_objset_userquota_find_data(db, tx);
 		} else {
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
 			int rf = 0;
 
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 				rf |= DB_RF_HAVESTRUCT;
 			error = dmu_spill_hold_by_dnode(dn,
 			    rf | DB_RF_MUST_SUCCEED,
 			    FTAG, (dmu_buf_t **)&db);
 			ASSERT(error == 0);
 			mutex_enter(&db->db_mtx);
 			data = (before) ? db->db.db_data :
 			    dmu_objset_userquota_find_data(db, tx);
 			/* Remember to drop the spill hold at the end. */
 			have_spill = B_TRUE;
 	} else {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 
 	/*
 	 * Pick where the callback stores the ids.  In the "after" case
 	 * with no data, user and group stay NULL.
 	 */
 	if (before) {
 		ASSERT(data);
 		user = &dn->dn_olduid;
 		group = &dn->dn_oldgid;
 	} else if (data) {
 		user = &dn->dn_newuid;
 		group = &dn->dn_newgid;
 	}
 
 	/*
 	 * Must always call the callback in case the object
 	 * type has changed and that type isn't an object type to track
 	 */
 	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
 	    user, group);
 
 	/*
 	 * Preserve existing uid/gid when the callback can't determine
 	 * what the new uid/gid are and the callback returned EEXIST.
 	 * The EEXIST error tells us to just use the existing uid/gid.
 	 * If we don't know what the old values are then just assign
 	 * them to 0, since that is a new file being created.
 	 */
 	if (!before && data == NULL && error == EEXIST) {
 		if (flags & DN_ID_OLD_EXIST) {
 			dn->dn_newuid = dn->dn_olduid;
 			dn->dn_newgid = dn->dn_oldgid;
 		} else {
 			dn->dn_newuid = 0;
 			dn->dn_newgid = 0;
 		}
 		error = 0;
 	}
 
 	if (db)
 		mutex_exit(&db->db_mtx);
 
 	/* Publish the outcome in the dnode's id flags. */
 	mutex_enter(&dn->dn_mtx);
 	if (error == 0 && before)
 		dn->dn_id_flags |= DN_ID_OLD_EXIST;
 	if (error == 0 && !before)
 		dn->dn_id_flags |= DN_ID_NEW_EXIST;
 
 	if (have_spill) {
 		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 	} else {
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 	}
 	mutex_exit(&dn->dn_mtx);
 	if (have_spill)
 		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 /*
  * Test the OBJSET_FLAG_USERACCOUNTING_COMPLETE bit in the objset's
  * on-disk flags (os_phys->os_flags).  The masked value itself is
  * returned, not a normalized B_TRUE/B_FALSE.
  */
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
 	return (os->os_phys->os_flags &
 	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 }
 
 /*
  * Bring an objset's user/group space accounting up to date by dirtying
  * the bonus buffer of every object, so that each object gets synced out
  * and accounted.
  *
  * Returns 0 if accounting is already complete or once the walk has
  * finished, ENOTSUP if accounting is not enabled for this objset, EINVAL
  * for a snapshot, and EINTR if a signal is pending.  Objects whose bonus
  * buffer cannot be held, or whose transaction cannot be assigned, are
  * skipped.  The error that ends the dmu_object_next() walk is not
  * reported to the caller.
  */
 int
 dmu_objset_userspace_upgrade(objset_t *os)
 {
 	uint64_t obj;
 	int err = 0;
 
 	if (dmu_objset_userspace_present(os))
 		return (0);
 	if (!dmu_objset_userused_enabled(os))
 		return (SET_ERROR(ENOTSUP));
 	if (dmu_objset_is_snapshot(os))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
 	 * synced out and now accounted.  If this is called
 	 * concurrently, or if we already did some work before crashing,
 	 * that's fine, since we track each object's accounted state
 	 * independently.
 	 */
 
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		dmu_tx_t *tx;
 		dmu_buf_t *db;
 		int objerr;
 
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			return (SET_ERROR(EINTR));
 
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
 		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
 		if (objerr != 0) {
 			/*
 			 * Drop the bonus hold taken above before moving
 			 * on; otherwise it would be leaked.
 			 */
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_abort(tx);
 			continue;
 		}
 		dmu_buf_will_dirty(db, tx);
 		dmu_buf_rele(db, FTAG);
 		dmu_tx_commit(tx);
 	}
 
 	/* Set on the in-core flags, then wait for a sync to complete. */
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	return (0);
 }
 
 /*
  * Fetch the space statistics of the objset's dataset.  This simply
  * forwards all four output pointers to dsl_dataset_space().
  */
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
 	    usedobjsp, availobjsp);
 }
 
 /*
  * Return the fsid guid of the objset's dataset, as reported by
  * dsl_dataset_fsid_guid().
  */
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
 	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
 }
 
 /*
  * Fill in the objset statistics: dds_type is taken from the on-disk
  * objset type, and when the objset has a dataset the rest is left to
  * dsl_dataset_fast_stat().
  */
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
 	stat->dds_type = os->os_phys->os_type;
 	if (os->os_dsl_dataset)
 		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
 }
 
 /*
  * Add the objset's properties to nv: the dataset's statistics (when the
  * objset has a dataset), followed by the objset type and whether
  * user accounting information is present.
  */
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
 	/* Only the meta objset is expected to lack a dataset. */
 	ASSERT(os->os_dsl_dataset ||
 	    os->os_phys->os_type == DMU_OST_META);
 
 	if (os->os_dsl_dataset != NULL)
 		dsl_dataset_stats(os->os_dsl_dataset, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
 	    os->os_phys->os_type);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
 	    dmu_objset_userspace_present(os));
 }
 
 /*
  * Tell whether the objset belongs to a snapshot.  An objset without a
  * dataset is never a snapshot.
  */
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset == NULL)
 		return (B_FALSE);
 
 	return (os->os_dsl_dataset->ds_is_snapshot);
 }
 
 /*
  * Look up the snapshot "name" in the dataset's snapshot-names ZAP using
  * a normalized (MT_FIRST) match.  The matching name is stored in "real"
  * (at most maxlen bytes) and *conflict is filled in by
  * zap_lookup_norm().  Returns ENOENT if the dataset has no
  * snapshot-names ZAP.
  */
 int
 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t snapnames = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	uint64_t ignored;
 
 	if (snapnames == 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
 	    snapnames, name, 8, 1, &ignored, MT_FIRST, real, maxlen,
 	    conflict));
 }
 
 /*
  * Return the snapshot found at the serialized ZAP cursor position *offp
  * of the dataset's snapshot-names ZAP.
  *
  * On success the name is copied to "name", *idp (if not NULL) receives
  * the entry's first integer, *case_conflict (if not NULL) receives its
  * normalization-conflict flag, and *offp is advanced to the next entry.
  * Returns ENOENT when there is no snapshot-names ZAP or no entry at
  * *offp, and ENAMETOOLONG when the name plus its terminator does not
  * fit in namelen bytes.  The pool config lock must be held.
  */
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error = 0;
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&zc,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&zc, &za) != 0) {
 		error = SET_ERROR(ENOENT);
 	} else if (strlen(za.za_name) + 1 > namelen) {
 		error = SET_ERROR(ENAMETOOLONG);
 	} else {
 		(void) strcpy(name, za.za_name);
 		if (idp)
 			*idp = za.za_first_integer;
 		if (case_conflict)
 			*case_conflict = za.za_normalization_conflict;
 		zap_cursor_advance(&zc);
 		*offp = zap_cursor_serialize(&zc);
 	}
 
 	/* The cursor is torn down on every path past its initialization. */
 	zap_cursor_fini(&zc);
 
 	return (error);
 }
 
 /*
  * Return the child directory found at the serialized ZAP cursor position
  * *offp of the child-dir ZAP of the objset's dsl dir.
  *
  * On success the name is copied to "name", *idp (if not NULL) receives
  * the entry's first integer, and *offp is advanced to the next entry.
  * Returns ENOENT when the objset's dataset is not the head dataset of
  * its dir or when there is no entry at *offp, and ENAMETOOLONG when the
  * name plus its terminator does not fit in namelen bytes.
  */
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error = 0;
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
 	    dsl_dir_phys(dd)->dd_head_dataset_obj)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&zc,
 	    dd->dd_pool->dp_meta_objset,
 	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&zc, &za) != 0) {
 		error = SET_ERROR(ENOENT);
 	} else if (strlen(za.za_name) + 1 > namelen) {
 		error = SET_ERROR(ENAMETOOLONG);
 	} else {
 		(void) strcpy(name, za.za_name);
 		if (idp)
 			*idp = za.za_first_integer;
 		zap_cursor_advance(&zc);
 		*offp = zap_cursor_serialize(&zc);
 	}
 
 	/* The cursor is torn down on every path past its initialization. */
 	zap_cursor_fini(&zc);
 
 	return (error);
 }
 
 /*
  * Per-directory work item for dmu_objset_find_dp().  One context is
  * allocated for every dsl dir that is visited; it is freed by
  * dmu_objset_find_dp_impl().
  */
 typedef struct dmu_objset_find_ctx {
 	/* taskq used to dispatch child dirs; NULL means recurse inline */
 	taskq_t		*dc_tq;
 	/* pool being traversed */
 	dsl_pool_t	*dc_dp;
 	/* object number of the dsl dir this context visits */
 	uint64_t	dc_ddobj;
 	/* callback invoked for every dataset found, with dc_arg */
 	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
 	void		*dc_arg;
 	/* DS_FIND_* flags */
 	int		dc_flags;
 	/* protects writes to *dc_error */
 	kmutex_t	*dc_error_lock;
 	/* first error seen by any context of the traversal */
 	int		*dc_error;
 } dmu_objset_find_ctx_t;
 
 /*
  * Visit one dsl dir (dcp->dc_ddobj) on behalf of dmu_objset_find_dp().
  *
  * With DS_FIND_CHILDREN, a copy of the context is made for every child
  * dir and either dispatched to dcp->dc_tq or, when there is no taskq,
  * processed by a direct recursive call.  With DS_FIND_SNAPSHOTS, dc_func
  * is called on every snapshot of the dir's head dataset.  Finally
  * dc_func is called on the head dataset itself.
  *
  * The first error encountered is stored in *dcp->dc_error under
  * dc_error_lock.  dcp is always freed before returning.
  */
 static void
 dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
 	dsl_pool_t *dp = dcp->dc_dp;
 	dmu_objset_find_ctx_t *child_dcp;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
 	int err = 0;
 
 	/* don't process if there already was an error */
 	if (*dcp->dc_error != 0)
 		goto out;
 
 	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
 	if (err != 0)
 		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		goto out;
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (dcp->dc_flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/*
 			 * The child context is a copy of ours, except for
 			 * the dir to visit; whoever processes it frees it.
 			 */
 			child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
 			*child_dcp = *dcp;
 			child_dcp->dc_ddobj = attr->za_first_integer;
 			if (dcp->dc_tq != NULL)
 				(void) taskq_dispatch(dcp->dc_tq,
 				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
 			else
 				dmu_objset_find_dp_impl(child_dcp);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
 		/* NOTE: this declaration shadows the function-level ds. */
 		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				err = dsl_dataset_hold_obj(dp,
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 				err = dcp->dc_func(dp, ds, dcp->dc_arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0)
 		goto out;
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 	if (err != 0)
 		goto out;
 	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
 
 out:
 	if (err != 0) {
 		mutex_enter(dcp->dc_error_lock);
 		/* only keep first error */
 		if (*dcp->dc_error == 0)
 			*dcp->dc_error = err;
 		mutex_exit(dcp->dc_error_lock);
 	}
 
 	kmem_free(dcp, sizeof (*dcp));
 }
 
 /*
  * Taskq entry point for dmu_objset_find_dp(): process the context passed
  * in arg with the pool config lock held.
  */
 static void
 dmu_objset_find_dp_cb(void *arg)
 {
 	dmu_objset_find_ctx_t *dcp = arg;
 	/* Saved up front: dmu_objset_find_dp_impl() frees dcp. */
 	dsl_pool_t *dp = dcp->dc_dp;
 
 	/*
 	 * We need to get a pool_config_lock here, as there are several
 	 * assert(pool_config_held) down the stack. Getting a lock via
 	 * dsl_pool_config_enter is risky, as it might be stalled by a
 	 * pending writer. This would deadlock, as the write lock can
 	 * only be granted when our parent thread gives up the lock.
 	 * The _prio interface gives us priority over a pending writer.
 	 */
 	dsl_pool_config_enter_prio(dp, FTAG);
 
 	dmu_objset_find_dp_impl(dcp);
 
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Find objsets under and including ddobj, call func(ds) on each.
  * The order for the enumeration is completely undefined.
  * func is called with dsl_pool_config held.
  *
  * Returns the first error recorded during the traversal (0 if none),
  * or ENOMEM if the taskq cannot be created.  The traversal runs in the
  * calling thread when DS_FIND_SERIALIZE is set or the config lock is
  * held as writer; otherwise it is spread over a taskq.
  */
 int
 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
     int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
 	int error = 0;
 	taskq_t *tq = NULL;
 	int ntasks;
 	dmu_objset_find_ctx_t *dcp;
 	kmutex_t err_lock;
 
 	/* Root context; child contexts are copies that share error state. */
 	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
 	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
 	dcp->dc_tq = NULL;
 	dcp->dc_dp = dp;
 	dcp->dc_ddobj = ddobj;
 	dcp->dc_func = func;
 	dcp->dc_arg = arg;
 	dcp->dc_flags = flags;
 	dcp->dc_error_lock = &err_lock;
 	dcp->dc_error = &error;
 
 	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
 		/*
 		 * In case a write lock is held we can't make use of
 		 * parallelism, as down the stack of the worker threads
 		 * the lock is asserted via dsl_pool_config_held.
 		 * In case of a read lock this is solved by getting a read
 		 * lock in each worker thread, which isn't possible in case
 		 * of a writer lock. So we fall back to the synchronous path
 		 * here.
 		 * In the future it might be possible to get some magic into
 		 * dsl_pool_config_held in a way that it returns true for
 		 * the worker threads so that a single lock held from this
 		 * thread suffices. For now, stay single threaded.
 		 */
 		/* dcp is freed by dmu_objset_find_dp_impl(). */
 		dmu_objset_find_dp_impl(dcp);
 		mutex_destroy(&err_lock);
 
 		return (error);
 	}
 
 	/* A dmu_find_threads of 0 selects four tasks per leaf vdev. */
 	ntasks = dmu_find_threads;
 	if (ntasks == 0)
 		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
 	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
 	    INT_MAX, 0);
 	if (tq == NULL) {
 		kmem_free(dcp, sizeof (*dcp));
 		mutex_destroy(&err_lock);
 
 		return (SET_ERROR(ENOMEM));
 	}
 	dcp->dc_tq = tq;
 
 	/* dcp will be freed by task */
 	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
 
 	/*
 	 * PORTING: this code relies on the property of taskq_wait to wait
 	 * until no more tasks are queued and no more tasks are active. As
 	 * we always queue new tasks from within other tasks, taskq_wait
 	 * reliably waits for the full recursion to finish, even though we
 	 * enqueue new tasks after taskq_wait has been called.
 	 * On platforms other than illumos, taskq_wait may not have this
 	 * property.
 	 */
 	taskq_wait(tq);
 	taskq_destroy(tq);
 	mutex_destroy(&err_lock);
 
 	return (error);
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it
  * will not be held when the callback is called.
  * Therefore this function should only be used when the pool is not changing
  * (e.g. in syncing context), or the callback can deal with the possible races.
  *
  * Child dirs are visited first (DS_FIND_CHILDREN), then snapshots
  * (DS_FIND_SNAPSHOTS), then the named objset itself.  The traversal
  * stops at the first non-zero value returned by func or by a recursive
  * call, and that value is returned.
  */
 static int
 dmu_objset_find_impl(spa_t *spa, const char *name,
     int func(const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
 	uint64_t thisobj;
 	int err;
 
 	dsl_pool_config_enter(dp, FTAG);
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
 	if (err != 0) {
 		dsl_pool_config_exit(dp, FTAG);
 		return (err);
 	}
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 		return (0);
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/*
 			 * The config lock is dropped around the recursive
 			 * call, which takes it again on its own.
 			 */
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
 			dsl_pool_config_exit(dp, FTAG);
 			err = dmu_objset_find_impl(spa, child,
 			    func, arg, flags);
 			dsl_pool_config_enter(dp, FTAG);
 			strfree(child);
 			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err != 0) {
 			dsl_dir_rele(dd, FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				/* func runs without the config lock held. */
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
 				dsl_pool_config_exit(dp, FTAG);
 				err = func(child, arg);
 				dsl_pool_config_enter(dp, FTAG);
 				strfree(child);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 	dsl_pool_config_exit(dp, FTAG);
 
 	if (err != 0)
 		return (err);
 
 	/* Apply to self. */
 	return (func(name, arg));
 }
 
 /*
  * See comment above dmu_objset_find_impl().
  */
 int
 dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	error = dmu_objset_find_impl(spa, name, func, arg, flags);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Store the consumer's private pointer in the objset.  The caller must
  * hold os_user_ptr_lock.
  */
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	os->os_user_ptr = user_ptr;
 }
 
 /*
  * Return the private pointer previously stored in the objset.  The
  * caller must hold os_user_ptr_lock.
  */
 void *
 dmu_objset_get_user(objset_t *os)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
 
 /*
  * Copy the filesystem part of a snapshot name (everything before the
  * '@') into buf, which must be at least MAXNAMELEN bytes.
  * Returns EINVAL if snapname contains no '@' and ENAMETOOLONG if the
  * filesystem part is MAXNAMELEN characters or longer.
  */
 int
 dmu_fsname(const char *snapname, char *buf)
 {
 	const char *at = strchr(snapname, '@');
 	size_t fslen;
 
 	if (at == NULL)
 		return (SET_ERROR(EINVAL));
 
 	fslen = at - snapname;
 	if (fslen >= MAXNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	/* The size argument leaves room for the terminating NUL. */
 	(void) strlcpy(buf, snapname, fslen + 1);
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	(revision 296519)
@@ -1,3227 +1,3228 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 #include <sys/bqueue.h>
 
 #ifdef __FreeBSD__
 #undef dump_write
 #define dump_write dmu_dump_write
 #endif
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
 int zfs_send_queue_length = 16 * 1024 * 1024;
 int zfs_recv_queue_length = 16 * 1024 * 1024;
 /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
 int zfs_send_set_freerecords_bit = B_TRUE;
 
 #ifdef _KERNEL
 TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
 #endif
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 const char *recv_clone_name = "%recv";
 
 /*
  * Evaluates to datablkszsec shifted left by SPA_MINBLOCKSHIFT plus
  * level * (indblkshift - SPA_BLKPTRSHIFT), computed in 64 bits.
  * Every argument is parenthesized so that arbitrary expressions can be
  * passed safely.
  */
 #define	BP_SPAN(datablkszsec, indblkshift, level) \
 	(((uint64_t)(datablkszsec)) << (SPA_MINBLOCKSHIFT + \
 	(level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 
 static void byteswap_record(dmu_replay_record_t *drr);
 
 /*
  * State for a send traversal thread.
  * NOTE(review): the code using this struct is outside this chunk; the
  * uncommented fields below are described from their names only.
  */
 struct send_thread_arg {
 	/* presumably the queue that carries records out of the thread */
 	bqueue_t	q;
 	dsl_dataset_t	*ds;		/* Dataset to traverse */
 	uint64_t	fromtxg;	/* Traverse from this txg */
 	int		flags;		/* flags to pass to traverse_dataset */
 	int		error_code;
 	boolean_t	cancel;
 	zbookmark_phys_t resume;
 };
 
 /*
  * One block handed over by the send traversal: the block pointer and
  * its bookmark, plus sizing information for the object it belongs to.
  * NOTE(review): producers/consumers are outside this chunk; "ln" is
  * presumably the bqueue linkage -- confirm.
  */
 struct send_block_record {
 	boolean_t		eos_marker; /* Marks the end of the stream */
 	blkptr_t		bp;
 	zbookmark_phys_t	zb;
 	uint8_t			indblkshift;
 	uint16_t		datablkszsec;
 	bqueue_node_t		ln;
 };
 
 /*
  * Write len bytes from buf to the send stream's file.
  *
  * In the kernel the data is written with fo_write(); outside the kernel
  * the function only reports EOPNOTSUPP.  The result is stored in
  * dsp->dsa_err and returned.  *dsp->dsa_off is advanced by len under
  * ds_sendstream_lock whether or not the write succeeded.  len must be a
  * multiple of 8.
  */
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
 	struct uio auio;
 	struct iovec aiov;
 	ASSERT0(len % 8);
 
 	/* Describe buf as a single-segment write from kernel space. */
 	aiov.iov_base = buf;
 	aiov.iov_len = len;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = len;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_offset = (off_t)-1;
 	auio.uio_td = dsp->dsa_td;
 #ifdef _KERNEL
 	if (dsp->dsa_fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 	dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
 	    dsp->dsa_td);
 #else
 	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
 	dsp->dsa_err = EOPNOTSUPP;
 #endif
 	mutex_enter(&ds->ds_sendstream_lock);
 	*dsp->dsa_off += len;
 	mutex_exit(&ds->ds_sendstream_lock);
 
 	return (dsp->dsa_err);
 }
 
 /*
  * For all record types except BEGIN, fill in the checksum (overlaid in
  * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
  * up to the start of the checksum itself.
  *
  * Writes the record held in dsp->dsa_drr to the stream, followed by
  * payload_len bytes of payload if payload_len is non-zero.  The running
  * checksum dsp->dsa_zc is updated with everything that is written.
  * Returns 0 on success and EINTR if either write fails.
  */
 static int
 dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
 {
 	/* The checksum field must be the last thing in the record. */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	/* Fold in the record up to, but not including, the checksum. */
 	fletcher_4_incremental_native(dsp->dsa_drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &dsp->dsa_zc);
 	if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
 		    drr_checksum.drr_checksum));
 		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
 	}
 	/* Now fold in the checksum field itself. */
 	fletcher_4_incremental_native(&dsp->dsa_drr->
 	    drr_u.drr_checksum.drr_checksum,
 	    sizeof (zio_cksum_t), &dsp->dsa_zc);
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 	if (payload_len != 0) {
 		fletcher_4_incremental_native(payload, payload_len,
 		    &dsp->dsa_zc);
 		if (dump_bytes(dsp, payload, payload_len) != 0)
 			return (SET_ERROR(EINTR));
 	}
 	return (0);
 }
 
 /*
  * Fill in the drr_free struct, or perform aggregation if the previous record is
  * also a free record, and the two are adjacent.
  *
  * Note that we send free records even for a full send, because we want to be
  * able to receive a full send as a clone, which requires a list of all the free
  * and freeobject records that were generated on the source.
  *
  * A length of -1ULL frees to the end of the object; such a record is
  * written out immediately instead of being left pending.  Returns 0 on
  * success and EINTR if a record could not be written.
  */
 static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
 
 	/*
 	 * When we receive a free record, dbuf_free_range() assumes
 	 * that the receiving system doesn't have any dbufs in the range
 	 * being freed.  This is always true because there is a one-record
 	 * constraint: we only send one WRITE record for any given
 	 * object,offset.  We know that the one-record constraint is
 	 * true because we always send data in increasing order by
 	 * object,offset.
 	 *
 	 * If the increasing-order constraint ever changes, we should find
 	 * another way to assert that the one-record constraint is still
 	 * satisfied.
 	 */
 	ASSERT(object > dsp->dsa_last_data_object ||
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 
 	/* A range that would wrap around is treated as "to the end". */
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREE, push it out,
 	 * since free block aggregation can only be done for blocks of the
 	 * same type (i.e., DRR_FREE records can only be aggregated with
 	 * other DRR_FREE records, and DRR_FREEOBJECTS records can only be
 	 * aggregated with other DRR_FREEOBJECTS records).
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE &&
 	    dsp->dsa_pending_op != PENDING_FREE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	if (dsp->dsa_pending_op == PENDING_FREE) {
 		/*
 		 * There should never be a PENDING_FREE if length is -1
 		 * (because dump_dnode is the only place where this
 		 * function is called with a -1, and only after flushing
 		 * any pending record).
 		 */
 		ASSERT(length != -1ULL);
 		/*
 		 * Check to see whether this free block can be aggregated
 		 * with pending one.
 		 */
 		if (drrf->drr_object == object && drrf->drr_offset +
 		    drrf->drr_length == offset) {
 			drrf->drr_length += length;
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
 			if (dump_record(dsp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dsp->dsa_pending_op = PENDING_NONE;
 		}
 	}
 	/* create a FREE record and make it pending */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_FREE;
 	drrf->drr_object = object;
 	drrf->drr_offset = offset;
 	drrf->drr_length = length;
 	drrf->drr_toguid = dsp->dsa_toguid;
 	if (length == -1ULL) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 	} else {
 		dsp->dsa_pending_op = PENDING_FREE;
 	}
 
 	return (0);
 }
 
 /*
  * Emit a DRR_WRITE record for blksz bytes of "data" at the given
  * object/offset, after flushing any pending aggregated record.
  *
  * When bp is NULL or embedded the record carries no checksum
  * (ZIO_CHECKSUM_OFF); otherwise the checksum type, dedup flag and dedup
  * key are taken from bp.  Returns 0 on success and EINTR if a record
  * could not be written.
  */
 static int
 dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
     uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
 	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 
 	/*
 	 * We send data in increasing object, offset order.
 	 * See comment in dump_free() for details.
 	 */
 	ASSERT(object > dsp->dsa_last_data_object ||
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 	/* Remember the last byte covered by this write. */
 	dsp->dsa_last_data_object = object;
 	dsp->dsa_last_data_offset = offset + blksz - 1;
 
 	/*
 	 * If there is any kind of pending aggregation (currently either
 	 * a grouping of free objects or free blocks), push it out to
 	 * the stream, since aggregation can't be done across operations
 	 * of different types.
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 	/* write a WRITE record */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_WRITE;
 	drrw->drr_object = object;
 	drrw->drr_type = type;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
 	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
 		/*
 		 * There's no pre-computed checksum for partial-block
 		 * writes or embedded BP's, so (like
 		 * fletcher4-checksummed blocks) userland will have to
 		 * compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
 		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
 		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)
 			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
 		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
 		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
 		drrw->drr_key.ddk_cksum = bp->blk_cksum;
 	}
 
 	if (dump_record(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 /*
  * Emit a DRR_WRITE_EMBEDDED record for an embedded block pointer, after
  * flushing any pending aggregated record.  The record's compression,
  * embedded type and logical/physical sizes are taken from bp, and the
  * embedded payload follows the record, padded to a multiple of 8 bytes.
  * Returns 0 on success and EINTR if a record could not be written.
  */
 static int
 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     int blksz, const blkptr_t *bp)
 {
 	char buf[BPE_PAYLOAD_SIZE];
 	struct drr_write_embedded *drrw =
 	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
 
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	ASSERT(BP_IS_EMBEDDED(bp));
 
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
 	drrw->drr_object = object;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
 	drrw->drr_compression = BP_GET_COMPRESS(bp);
 	drrw->drr_etype = BPE_GET_ETYPE(bp);
 	drrw->drr_lsize = BPE_GET_LSIZE(bp);
 	drrw->drr_psize = BPE_GET_PSIZE(bp);
 
 	/*
 	 * The payload is sent rounded up to 8 bytes.  Zero the buffer
 	 * first so the padding is never uninitialized stack memory
 	 * (decode_embedded_bp_compressed() presumably fills in only the
 	 * payload bytes -- confirm).
 	 */
 	bzero(buf, sizeof (buf));
 	decode_embedded_bp_compressed(bp, buf);
 
 	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 /*
  * Emit a DRR_SPILL record carrying blksz bytes of spill block data for
  * the given object.  Any pending aggregated record is written out
  * first.  Returns 0 on success and EINTR if a record could not be
  * written.
  */
 static int
 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 {
 	dmu_replay_record_t *rec = dsp->dsa_drr;
 	struct drr_spill *spill = &(rec->drr_u.drr_spill);
 
 	/* Flush whatever record is still waiting to be aggregated. */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	/* Build the SPILL record from scratch. */
 	bzero(rec, sizeof (dmu_replay_record_t));
 	rec->drr_type = DRR_SPILL;
 	spill->drr_toguid = dsp->dsa_toguid;
 	spill->drr_length = blksz;
 	spill->drr_object = object;
 
 	if (dump_record(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));
 
 	return (0);
 }
 
 /*
  * Queue a DRR_FREEOBJECTS record covering "numobjs" objects starting at
  * "firstobj".  The record is left pending in dsp->dsa_drr so that a later
  * call for the immediately following object range can be folded into it.
  *
  * Returns 0 on success, or EINTR if pushing out a pending record fails.
  */
 static int
 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 {
 	struct drr_freeobjects *pending =
 	    &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
 	/*
 	 * A pending FREEOBJECTS record that ends exactly where this range
 	 * begins can simply be extended.
 	 */
 	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS &&
 	    pending->drr_firstobj + pending->drr_numobjs == firstobj) {
 		pending->drr_numobjs += numobjs;
 		return (0);
 	}
 
 	/*
 	 * Anything else that is pending (a record of another type, or a
 	 * FREEOBJECTS record that is not adjacent) has to be written out
 	 * before dsa_drr is reused.
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	/* Start a fresh FREEOBJECTS record and leave it pending. */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
 	pending->drr_firstobj = firstobj;
 	pending->drr_numobjs = numobjs;
 	pending->drr_toguid = dsp->dsa_toguid;
 	dsp->dsa_pending_op = PENDING_FREEOBJECTS;
 
 	return (0);
 }
 
 /*
  * Emit the stream records for a single dnode.
  *
  * Objects below the resume point are skipped.  A NULL dnode, or one of
  * type DMU_OT_NONE, is reported through dump_freeobjects().  Otherwise a
  * DRR_OBJECT record carrying the bonus buffer is written, followed by a
  * free of everything past the last block of the object.
  *
  * Returns 0 on success, or EINTR on failure.
  */
 static int
 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 {
 	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
 	int datablksz;
 
 	if (object < dsp->dsa_resume_object) {
 		/*
 		 * A resumed send revisits every dnode in the dnode block
 		 * that holds the resume object, so dnodes ahead of the
 		 * resume point show up here and need not be sent again.
 		 * They can be at most one dnode block behind.
 		 */
 		ASSERT3U(dsp->dsa_resume_object - object, <,
 		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
 		return (0);
 	}
 
 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 		return (dump_freeobjects(dsp, object, 1));
 
 	/* Flush the pending record before dsa_drr is overwritten. */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
 		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 
 	/* Fill in the OBJECT record. */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_OBJECT;
 	drro->drr_object = object;
 	drro->drr_type = dnp->dn_type;
 	drro->drr_bonustype = dnp->dn_bonustype;
 	drro->drr_blksz = datablksz;
 	drro->drr_bonuslen = dnp->dn_bonuslen;
 	drro->drr_checksumtype = dnp->dn_checksum;
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
 	/* Without the large-blocks feature the block size is capped. */
 	if (drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE &&
 	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
 		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
 
 	/* The bonus buffer follows the header, padded to 8 bytes. */
 	if (dump_record(dsp, DN_BONUS(dnp),
 	    P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
 		return (SET_ERROR(EINTR));
 
 	/* Free everything beyond the last block of the object. */
 	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * datablksz,
 	    -1ULL) != 0 || dsp->dsa_err != 0)
 		return (SET_ERROR(EINTR));
 
 	return (0);
 }
 
 /*
  * Decide whether the block pointer "bp" may be sent as a WRITE_EMBEDDED
  * record, given the feature flags negotiated for this stream.
  */
 static boolean_t
 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
 {
 	/* Only embedded block pointers qualify at all. */
 	if (!BP_IS_EMBEDDED(bp))
 		return (B_FALSE);
 
 	/*
 	 * A compression function outside the legacy set is acceptable only
 	 * if the EMBED_DATA_LZ4 stream feature is on.
 	 */
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
 	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) == 0)
 		return (B_FALSE);
 
 	/*
 	 * The embed type has to be enabled explicitly; only the DATA type
 	 * is handled here.
 	 */
 	if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA)
 		return (B_FALSE);
 
 	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Traversal callback run on the worker thread of dmu_send_impl.  Each
  * visited block pointer is copied into a send_block_record and placed on
  * the thread's queue for the main thread to consume.
  */
 /*ARGSUSED*/
 static int
 send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
 {
 	struct send_thread_arg *sta = arg;
 	struct send_block_record *rec;
 	uint64_t weight;
 
 	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
 	    zb->zb_object >= sta->resume.zb_object);
 
 	/* The consumer asked us to stop. */
 	if (sta->cancel)
 		return (SET_ERROR(EINTR));
 
 	/* Nothing to queue when there is no block pointer. */
 	if (bp == NULL) {
 		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
 		return (0);
 	}
 
 	/* Negative levels are not sent either. */
 	if (zb->zb_level < 0)
 		return (0);
 
 	rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
 	rec->eos_marker = B_FALSE;
 	rec->bp = *bp;
 	rec->zb = *zb;
 	rec->indblkshift = dnp->dn_indblkshift;
 	rec->datablkszsec = dnp->dn_datablkszsec;
 
 	/* The data block size is the size the queue accounts for. */
 	weight = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 	bqueue_enqueue(&sta->q, rec, weight);
 
 	return (0);
 }
 
 /*
  * Thread body that drives the dataset traversal.  Any traversal error
  * other than EINTR is stored in the thread argument, and an End of Stream
  * record is always queued once the traversal is over.  With no dataset to
  * traverse, the End of Stream record is queued straight away.
  */
 static void
 send_traverse_thread(void *arg)
 {
 	struct send_thread_arg *st_arg = arg;
 	struct send_block_record *eos;
 
 	if (st_arg->ds != NULL) {
 		int err = traverse_dataset_resume(st_arg->ds,
 		    st_arg->fromtxg, &st_arg->resume,
 		    st_arg->flags, send_cb, st_arg);
 
 		/* EINTR is not recorded as the thread's error. */
 		if (err != EINTR)
 			st_arg->error_code = err;
 	}
 
 	eos = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
 	eos->eos_marker = B_TRUE;
 	bqueue_enqueue(&st_arg->q, eos, 1);
 	thread_exit();
 }
 
 /*
  * This function actually handles figuring out what kind of record needs to be
  * dumped, reading the data (which has hopefully been prefetched), and calling
  * the appropriate helper function.
  *
  * "data" is one record queued by send_cb.  Returns 0 on success, EIO when a
  * block cannot be read, or the error (expected to be EINTR) reported by the
  * dump_*() helper that was called.
  */
 static int
 do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
 	const blkptr_t *bp = &data->bp;
 	const zbookmark_phys_t *zb = &data->zb;
 	uint8_t indblkshift = data->indblkshift;
 	uint16_t dblkszsec = data->datablkszsec;
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 	int err = 0;
 
 	ASSERT3U(zb->zb_level, >=, 0);
 
 	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
 	    zb->zb_object >= dsa->dsa_resume_object);
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		/* Special objects other than the meta-dnode are not sent. */
 		return (0);
 	} else if (BP_IS_HOLE(bp) &&
 	    zb->zb_object == DMU_META_DNODE_OBJECT) {
 		/* A hole in the meta-dnode: free the objects it spans. */
 		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
 	} else if (BP_IS_HOLE(bp)) {
 		/* A hole in any other object: free the range it spans. */
 		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t offset = zb->zb_blkid * span;
 		err = dump_free(dsa, zb->zb_object, offset, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		/* Non-hole indirect blocks and the objset block are skipped. */
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
 		/* A block of dnodes: read it and dump each dnode in turn. */
 		int blksz = BP_GET_LSIZE(bp);
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 
 		ASSERT0(zb->zb_level);
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
 		dnode_phys_t *blk = abuf->b_data;
 		/* Object number of the first dnode held in this block. */
 		uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
 		for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
 			err = dump_dnode(dsa, dnobj + i, blk + i);
 			if (err != 0)
 				break;
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	} else if (type == DMU_OT_SA) {
 		/* A block of type DMU_OT_SA is sent as a SPILL record. */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
 		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	} else if (backup_do_embed(dsa, bp)) {
 		/* it's an embedded level-0 block of a regular object */
 		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
 		ASSERT0(zb->zb_level);
 		err = dump_write_embedded(dsa, zb->zb_object,
 		    zb->zb_blkid * blksz, blksz, bp);
 	} else {
 		/* it's a level-0 block of a regular object */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
 		uint64_t offset;
 
 		ASSERT0(zb->zb_level);
 		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
 		    (zb->zb_object == dsa->dsa_resume_object &&
 		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
 
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0) {
 			/*
 			 * The read failed.  With zfs_send_corrupt_data set,
 			 * a block filled with a marker pattern is sent in
 			 * its place; otherwise the send fails with EIO.
 			 */
 			if (zfs_send_corrupt_data) {
 				/* Send a block filled with 0x"zfs badd bloc" */
 				abuf = arc_buf_alloc(spa, blksz, &abuf,
 				    ARC_BUFC_DATA);
 				uint64_t *ptr;
 				for (ptr = abuf->b_data;
 				    (char *)ptr < (char *)abuf->b_data + blksz;
 				    ptr++)
 					*ptr = 0x2f5baddb10cULL;
 			} else {
 				return (SET_ERROR(EIO));
 			}
 		}
 
 		offset = zb->zb_blkid * blksz;
 
 		if (!(dsa->dsa_featureflags &
 		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 		    blksz > SPA_OLD_MAXBLOCKSIZE) {
 			/*
 			 * The stream does not carry the large-blocks feature,
 			 * so the block is split into writes of at most
 			 * SPA_OLD_MAXBLOCKSIZE bytes.  These are passed a
 			 * NULL bp because each covers only part of the block.
 			 */
 			char *buf = abuf->b_data;
 			while (blksz > 0 && err == 0) {
 				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
 				err = dump_write(dsa, type, zb->zb_object,
 				    offset, n, NULL, buf);
 				offset += n;
 				buf += n;
 				blksz -= n;
 			}
 		} else {
 			err = dump_write(dsa, type, zb->zb_object,
 			    offset, blksz, bp, abuf->b_data);
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
 	ASSERT(err == 0 || err == EINTR);
 	return (err);
 }
 
 /*
  * Dequeue the next record from "bq", release the record the caller has
  * finished with ("data"), and hand back the new one.
  */
 static struct send_block_record *
 get_next_record(bqueue_t *bq, struct send_block_record *data)
 {
 	struct send_block_record *next;
 
 	next = bqueue_dequeue(bq);
 	kmem_free(data, sizeof (struct send_block_record));
 
 	return (next);
 }
 
 /*
  * Actually do the bulk of the work in a zfs send.
  *
  * Builds the DRR_BEGIN record for to_ds, starts a traversal thread that
  * queues block records, dumps each queued record to the stream, and
  * finishes with a DRR_END record carrying the stream checksum.
  *
  * ancestor_zb, when not NULL, supplies the "from" guid and creation txg for
  * an incremental stream.  A non-zero resumeobj or resumeoff marks the stream
  * as resuming and adds a payload to the BEGIN record.
  *
  * Returns 0 on success or an error number.
  *
  * Note: Releases dp using the specified tag.
  */
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
     zfs_bookmark_phys_t *ancestor_zb,
     boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
     uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
     vnode_t *vp, offset_t *off)
 #else
     struct file *fp, offset_t *off)
 #endif
 {
 	objset_t *os;
 	dmu_replay_record_t *drr;
 	dmu_sendarg_t *dsp;
 	int err;
 	uint64_t fromtxg = 0;
 	uint64_t featureflags = 0;
 	struct send_thread_arg to_arg = { 0 };
 
 	err = dmu_objset_from_ds(to_ds, &os);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	/* Start filling in the BEGIN record. */
 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
 	    DMU_SUBSTREAM);
 
 #ifdef _KERNEL
 	/* A ZPL version of at least ZPL_VERSION_SA turns on SA_SPILL. */
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		uint64_t version;
 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
 			kmem_free(drr, sizeof (dmu_replay_record_t));
 			dsl_pool_rele(dp, tag);
 			return (SET_ERROR(EINVAL));
 		}
 		if (version >= ZPL_VERSION_SA) {
 			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
 		}
 	}
 #endif
 
 	/*
 	 * The optional stream features are enabled only when the caller
 	 * allows them and the dataset or pool actually uses them.
 	 */
 	if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
 		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
 		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
 	}
 
 	if (resumeobj != 0 || resumeoff != 0) {
 		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
 	}
 
 	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
 	    featureflags);
 
 	drr->drr_u.drr_begin.drr_creation_time =
 	    dsl_dataset_phys(to_ds)->ds_creation_time;
 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
 	if (is_clone)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
 	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 	if (zfs_send_set_freerecords_bit)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
 	if (ancestor_zb != NULL) {
 		drr->drr_u.drr_begin.drr_fromguid =
 		    ancestor_zb->zbm_guid;
 		fromtxg = ancestor_zb->zbm_creation_txg;
 	}
 	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
 	/* A dataset that is not a snapshot gets a "@--head--" suffix. */
 	if (!to_ds->ds_is_snapshot) {
 		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
 		    sizeof (drr->drr_u.drr_begin.drr_toname));
 	}
 
 	/* Per-stream state shared with the dump_*() helpers. */
 	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 
 	dsp->dsa_drr = drr;
 	dsp->dsa_outfd = outfd;
 	dsp->dsa_proc = curproc;
 	dsp->dsa_td = curthread;
 	dsp->dsa_fp = fp;
 	dsp->dsa_os = os;
 	dsp->dsa_off = off;
 	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
 	dsp->dsa_featureflags = featureflags;
 	dsp->dsa_resume_object = resumeobj;
 	dsp->dsa_resume_offset = resumeoff;
 
 	/* Register this stream on the dataset's list of send streams. */
 	mutex_enter(&to_ds->ds_sendstream_lock);
 	list_insert_head(&to_ds->ds_sendstreams, dsp);
 	mutex_exit(&to_ds->ds_sendstream_lock);
 
 	/*
 	 * Take a long hold on the dataset, then release the pool hold that
 	 * the caller passed in.
 	 */
 	dsl_dataset_long_hold(to_ds, FTAG);
 	dsl_pool_rele(dp, tag);
 
 	void *payload = NULL;
 	size_t payload_len = 0;
 	if (resumeobj != 0 || resumeoff != 0) {
 		/*
 		 * Resuming: point the traversal at the resume position and
 		 * pack the resume object and offset into the BEGIN payload.
 		 */
 		dmu_object_info_t to_doi;
 		err = dmu_object_info(os, resumeobj, &to_doi);
 		if (err != 0)
 			goto out;
 		SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
 		    resumeoff / to_doi.doi_data_block_size);
 
 		nvlist_t *nvl = fnvlist_alloc();
 		fnvlist_add_uint64(nvl, "resume_object", resumeobj);
 		fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
 		payload = fnvlist_pack(nvl, &payload_len);
 		drr->drr_payloadlen = payload_len;
 		fnvlist_free(nvl);
 	}
 
 	/* Write the BEGIN record together with its payload, if any. */
 	err = dump_record(dsp, payload, payload_len);
 	fnvlist_pack_free(payload, payload_len);
 	if (err != 0) {
 		err = dsp->dsa_err;
 		goto out;
 	}
 
 	/* Set up the queue and start the thread that traverses to_ds. */
 	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
 	    offsetof(struct send_block_record, ln));
 	to_arg.error_code = 0;
 	to_arg.cancel = B_FALSE;
 	to_arg.ds = to_ds;
 	to_arg.fromtxg = fromtxg;
 	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
 	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
 	    TS_RUN, minclsyspri);
 
 	struct send_block_record *to_data;
 	to_data = bqueue_dequeue(&to_arg.q);
 
 	/*
 	 * Dump queued records until the End of Stream marker arrives, an
 	 * error occurs, or a signal is pending.
 	 */
 	while (!to_data->eos_marker && err == 0) {
 		err = do_dump(dsp, to_data);
 		to_data = get_next_record(&to_arg.q, to_data);
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			err = EINTR;
 	}
 
 	if (err != 0) {
 		/*
 		 * Tell the traversal to stop, then drain the queue until
 		 * its End of Stream marker shows up.
 		 */
 		to_arg.cancel = B_TRUE;
 		while (!to_data->eos_marker) {
 			to_data = get_next_record(&to_arg.q, to_data);
 		}
 	}
 	kmem_free(to_data, sizeof (*to_data));
 
 	bqueue_destroy(&to_arg.q);
 
 	/* An error recorded by the traversal thread fails the send too. */
 	if (err == 0 && to_arg.error_code != 0)
 		err = to_arg.error_code;
 
 	if (err != 0)
 		goto out;
 
 	/* Write out the last aggregated record, if one is still pending. */
 	if (dsp->dsa_pending_op != PENDING_NONE)
 		if (dump_record(dsp, NULL, 0) != 0)
 			err = SET_ERROR(EINTR);
 
 	if (err != 0) {
 		if (err == EINTR && dsp->dsa_err != 0)
 			err = dsp->dsa_err;
 		goto out;
 	}
 
 	/* Finish the stream with an END record carrying the checksum. */
 	bzero(drr, sizeof (dmu_replay_record_t));
 	drr->drr_type = DRR_END;
 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
 	if (dump_record(dsp, NULL, 0) != 0)
 		err = dsp->dsa_err;
 
 out:
 	/* Unregister the stream, free its state and drop the long hold. */
 	mutex_enter(&to_ds->ds_sendstream_lock);
 	list_remove(&to_ds->ds_sendstreams, dsp);
 	mutex_exit(&to_ds->ds_sendstream_lock);
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
 	dsl_dataset_long_rele(to_ds, FTAG);
 
 	return (err);
 }
 
 /*
  * Send the dataset whose object number is "tosnap" in "pool".  When
  * "fromsnap" is non-zero the stream is incremental from that dataset
  * object, which must come before "tosnap" according to
  * dsl_dataset_is_before(); otherwise EXDEV is returned and nothing is sent.
  *
  * Returns 0 on success or an error number.
  */
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
     boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
     int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	dsl_dataset_t *fromds = NULL;
 	int err;
 
 	err = dsl_pool_hold(pool, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != 0) {
 		zfs_bookmark_phys_t zb;
 		boolean_t is_clone;
 
 		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 		if (!dsl_dataset_is_before(ds, fromds, 0))
 			err = SET_ERROR(EXDEV);
 		zb.zbm_creation_time =
 		    dsl_dataset_phys(fromds)->ds_creation_time;
 		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
 		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 		is_clone = (fromds->ds_dir != ds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
 
 		/*
 		 * Act on the ordering check above instead of letting the
 		 * call below overwrite its result.  dmu_send_impl() has not
 		 * run yet, so the pool hold must be released here.
 		 */
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 
 		/* dmu_send_impl() releases dp using FTAG. */
 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
 		    embedok, large_block_ok, outfd, 0, 0, fp, off);
 	} else {
 		/* dmu_send_impl() releases dp using FTAG. */
 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
 		    embedok, large_block_ok, outfd, 0, 0, fp, off);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Send the dataset named "tosnap".  "fromsnap", when not NULL, names the
  * snapshot ('@') or bookmark ('#') the stream is incremental from.  A
  * non-zero resumeobj or resumeoff resumes a previously interrupted send.
  *
  * When "tosnap" is not a snapshot and the pool is writeable, the dataset
  * is owned for the duration of the send so that it cannot change.
  *
  * Returns 0 on success or an error number.
  */
 int
 dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
     boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
     vnode_t *vp, offset_t *off)
 #else
     struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 	boolean_t owned = B_FALSE;
 
 	/* fromsnap has to name a snapshot or a bookmark. */
 	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_pool_hold(tosnap, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
 		/*
 		 * We are sending a filesystem or volume.  Ensure
 		 * that it doesn't change by owning the dataset.
 		 */
 		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
 		owned = B_TRUE;
 	} else {
 		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
 	}
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != NULL) {
 		zfs_bookmark_phys_t zb;
 		boolean_t is_clone = B_FALSE;
 		/*
 		 * Length of the filesystem part of tosnap.  tosnap has no
 		 * '@' when a filesystem or volume is being sent, so fall
 		 * back to the whole name rather than doing arithmetic on a
 		 * NULL pointer.
 		 */
 		const char *at = strchr(tosnap, '@');
 		size_t fsnamelen = (at != NULL) ?
 		    (size_t)(at - tosnap) : strlen(tosnap);
 
 		/*
 		 * If the fromsnap is in a different filesystem, then
 		 * mark the send stream as a clone.
 		 */
 		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
 		    (fromsnap[fsnamelen] != '@' &&
 		    fromsnap[fsnamelen] != '#')) {
 			is_clone = B_TRUE;
 		}
 
 		if (strchr(fromsnap, '@')) {
 			/* Incremental from a snapshot. */
 			dsl_dataset_t *fromds;
 			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
 			if (err == 0) {
 				if (!dsl_dataset_is_before(ds, fromds, 0))
 					err = SET_ERROR(EXDEV);
 				zb.zbm_creation_time =
 				    dsl_dataset_phys(fromds)->ds_creation_time;
 				zb.zbm_creation_txg =
 				    dsl_dataset_phys(fromds)->ds_creation_txg;
 				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 				is_clone = (ds->ds_dir != fromds->ds_dir);
 				dsl_dataset_rele(fromds, FTAG);
 			}
 		} else {
 			/* Incremental from a bookmark. */
 			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
 		}
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
 		/* dmu_send_impl() releases dp using FTAG. */
 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
 		    embedok, large_block_ok,
 		    outfd, resumeobj, resumeoff, fp, off);
 	} else {
 		/* dmu_send_impl() releases dp using FTAG. */
 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
 		    embedok, large_block_ok,
 		    outfd, resumeobj, resumeoff, fp, off);
 	}
 	if (owned)
 		dsl_dataset_disown(ds, FTAG);
 	else
 		dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Turn a raw byte count "size" into a stream size estimate for "ds" and
  * store it in *sizep.
  *
  * The estimate assumes space, on disk and in the stream, is dominated by
  * data.  It corrects for indirect blocks and for per-block stream records,
  * and ignores per-object overhead such as dnodes and DRR_OBJECT records.
  *
  * Returns 0 on success, or the error from reading the "recordsize"
  * property.
  */
 static int
 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
     uint64_t *sizep)
 {
 	uint64_t recordsize;
 	uint64_t nblocks;
 	int err;
 
 	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Take out the approximate space used by indirect blocks.  All
 	 * blocks are assumed to be recordsize, and ditto blocks plus
 	 * internal fragmentation are assumed to cancel out compression, so
 	 * the indirect cost comes to sizeof (blkptr_t) per block.
 	 */
 	nblocks = size / recordsize;
 	size -= nblocks * sizeof (blkptr_t);
 
 	/* Every block also brings one replay record into the stream. */
 	nblocks = size / recordsize;
 	size += nblocks * sizeof (dmu_replay_record_t);
 
 	*sizep = size;
 
 	return (0);
 }
 
 /*
  * Estimate the size of a send stream for snapshot "ds", optionally
  * incremental from snapshot "fromds", and store the result in *sizep.
  *
  * Returns EINVAL if either dataset is not a snapshot, EXDEV if fromds does
  * not come before ds, or the error from a helper that fails.
  */
 int
 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
 	uint64_t size;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/* tosnap must be a snapshot */
 	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (fromds != NULL) {
 		uint64_t used, comp;
 		int err;
 
 		/* fromsnap, if provided, must be a snapshot */
 		if (!fromds->ds_is_snapshot)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * It also has to be an earlier snapshot of the same
 		 * filesystem as tosnap, or of the origin's filesystem.
 		 */
 		if (!dsl_dataset_is_before(ds, fromds, 0))
 			return (SET_ERROR(EXDEV));
 
 		/* Uncompressed size of what changed between the two. */
 		err = dsl_dataset_space_written(fromds, ds,
 		    &used, &comp, &size);
 		if (err != 0)
 			return (err);
 	} else {
 		/* A full send covers the whole snapshot. */
 		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 	}
 
 	return (dmu_adjust_send_estimate_for_indirects(ds, size, sizep));
 }
 
 /*
  * Traversal callback that adds the uncompressed size of every block that
  * is not a hole to the uint64_t counter passed in through "arg".
  */
 /* ARGSUSED */
 static int
 dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	uint64_t *total = arg;
 
 	/* Missing block pointers and holes add nothing. */
 	if (bp == NULL || BP_IS_HOLE(bp))
 		return (0);
 
 	*total += BP_GET_UCSIZE(bp);
 	return (0);
 }
 
 /*
  * Given a destination snapshot and a TXG, calculate the approximate size
  * of a send stream sent from that TXG.  A from_txg of zero means the whole
  * snapshot will be sent.
  *
  * Returns EINVAL if ds is not a snapshot, EXDEV if from_txg is not earlier
  * than the snapshot's creation txg, or the error from a helper that fails.
  */
 int
 dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
     uint64_t *sizep)
 {
 	uint64_t total = 0;
 	int err;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/* tosnap must be a snapshot */
 	if (!dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	/* from_txg has to precede the creation of the snapshot */
 	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg)
 		return (SET_ERROR(EXDEV));
 
 	/*
 	 * Walk the blocks of the snapshot born after from_txg and add up
 	 * their uncompressed sizes.
 	 */
 	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
 	    dmu_calculate_send_traversal, &total);
 	if (err)
 		return (err);
 
 	return (dmu_adjust_send_estimate_for_indirects(ds, total, sizep));
 }
 
 /*
  * Argument shared by the check and sync functions that begin a receive.
  */
 typedef struct dmu_recv_begin_arg {
 	/* Name of the origin dataset to clone from, or NULL for none. */
 	const char *drba_origin;
 	/* Receive cookie; supplies the BEGIN record, target names and flags. */
 	dmu_recv_cookie_t *drba_cookie;
 	/* Credentials used for limit checks and dataset creation. */
 	cred_t *drba_cred;
 	/*
 	 * Object number of the snapshot the temporary clone is created
 	 * from; set by the check phase, 0 when there is none.
 	 */
 	uint64_t drba_snapobj;
 } dmu_recv_begin_arg_t;
 
 /*
  * Check whether a stream can be received into the existing dataset "ds".
  * "fromguid" is the guid of the snapshot an incremental stream starts
  * from, or 0 for a full stream.
  *
  * On success drba->drba_snapobj is set to the object number of the
  * snapshot that the temporary clone should be based on (0 if none).
  * Returns 0 on success or an error number.
  */
 static int
 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
     uint64_t fromguid)
 {
 	uint64_t val;
 	int error;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* temporary clone name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EBUSY : error);
 
 	/* new snapshot name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EEXIST : error);
 
 	/*
 	 * Check snapshot limit before receiving. We'll recheck again at the
 	 * end, but might as well abort before receiving if we're already over
 	 * the limit.
 	 *
 	 * Note that we do not check the file system limit with
 	 * dsl_dir_fscount_check because the temporary %clones don't count
 	 * against that limit.
 	 */
 	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
 	    NULL, drba->drba_cred);
 	if (error != 0)
 		return (error);
 
 	if (fromguid != 0) {
 		dsl_dataset_t *snap;
 		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		/* Find snapshot in this dir that matches fromguid. */
 		while (obj != 0) {
 			error = dsl_dataset_hold_obj(dp, obj, FTAG,
 			    &snap);
 			if (error != 0)
 				return (SET_ERROR(ENODEV));
 			/* The chain has left this dsl_dir: no match here. */
 			if (snap->ds_dir != ds->ds_dir) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			/* On a match, leave the loop with snap still held. */
 			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
 				break;
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 		}
 		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
 		if (drba->drba_cookie->drc_force) {
 			drba->drba_snapobj = obj;
 		} else {
 			/*
 			 * If we are not forcing, there must be no
 			 * changes since fromsnap.
 			 */
 			if (dsl_dataset_modified_since_snap(ds, snap)) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ETXTBSY));
 			}
 			drba->drba_snapobj = ds->ds_prev->ds_object;
 		}
 
 		dsl_dataset_rele(snap, FTAG);
 	} else {
 		/* if full, then must be forced */
 		if (!drba->drba_cookie->drc_force)
 			return (SET_ERROR(EEXIST));
 		/* start from $ORIGIN@$ORIGIN, if supported */
 		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
 		    dp->dp_origin_snap->ds_object : 0;
 	}
 
 	return (0);
 
 }
 
 /*
  * Check function for beginning a non-resuming receive.  Validates the
  * BEGIN record against the features of the receiving pool, then checks
  * either the existing target filesystem or, when the target does not
  * exist, the parent it would be created under.
  *
  * Returns 0 if the receive may begin, or an error number.
  */
 static int
 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	int error;
 	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
 
 	/*
 	 * Reject compound streams, unknown objset types, and clone streams
 	 * for which no origin was given.
 	 */
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
 	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
 		return (SET_ERROR(EINVAL));
 
 	/* Verify pool version supports SA if SA_SPILL feature set */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
 
 	/* A resumable receive needs the extensible_dataset feature. */
 	if (drba->drba_cookie->drc_resumable &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
 	 * record to a plain WRITE record, so the pool must have the
 	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
 	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * The receiving code doesn't know how to translate large blocks
 	 * to smaller ones, so the pool must have the LARGE_BLOCKS
 	 * feature enabled if the stream has LARGE_BLOCKS.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
 		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_begin_check_existing_impl(drba, ds, fromguid);
 		dsl_dataset_rele(ds, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
 		char buf[MAXNAMELEN];
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
 		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * If we're receiving a full send as a clone, and it doesn't
 		 * contain all the necessary free records and freeobject
 		 * records, reject it.
 		 */
 		if (fromguid == 0 && drba->drba_origin &&
 		    !(flags & DRR_FLAG_FREERECORDS))
 			return (SET_ERROR(EINVAL));
 
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
 		/*
 		 * Copy tofs up to, but not including, its last '/'.
 		 * NOTE(review): this assumes tofs contains a '/'; confirm
 		 * that callers guarantee it.
 		 */
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Check filesystem and snapshot limits before receiving. We'll
 		 * recheck snapshot limits again at the end (we create the
 		 * filesystems and increment those counts during begin_sync).
 		 */
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		if (drba->drba_origin != NULL) {
 			/*
 			 * The origin must exist and be a snapshot.  For an
 			 * incremental stream its guid must match fromguid.
 			 */
 			dsl_dataset_t *origin;
 			error = dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
 			    fromguid != 0) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			dsl_dataset_rele(origin, FTAG);
 		}
 		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	/* Any other error from the hold of tofs is returned unchanged. */
 	return (error);
 }
 
 /*
  * Sync function for beginning a receive.  Creates the dataset that the
  * stream is received into: a temporary clone under the existing target
  * filesystem, or a new dataset when the target does not exist.  The new
  * dataset is owned with dmu_recv_tag, marked inconsistent and stored in
  * the receive cookie.
  */
 static void
 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	dsl_dataset_t *ds, *newds;
 	uint64_t dsobj;
 	int error;
 	uint64_t crflags = 0;
 
 	/* Carry the CI_DATA stream flag over as a dataset creation flag. */
 	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
 		crflags |= DS_FLAG_CI_DATASET;
 
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* create temporary clone */
 		dsl_dataset_t *snap = NULL;
 		if (drba->drba_snapobj != 0) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    drba->drba_snapobj, FTAG, &snap));
 		}
 		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
 		    snap, crflags, drba->drba_cred, tx);
 		if (drba->drba_snapobj != 0)
 			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele(ds, FTAG);
 	} else {
 		/* The target could not be held: create it under its parent. */
 		dsl_dir_t *dd;
 		const char *tail;
 		dsl_dataset_t *origin = NULL;
 
 		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
 
 		if (drba->drba_origin != NULL) {
 			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin));
 		}
 
 		/* Create new dataset. */
 		dsobj = dsl_dataset_create_sync(dd,
 		    strrchr(tofs, '/') + 1,
 		    origin, crflags, drba->drba_cred, tx);
 		if (origin != NULL)
 			dsl_dataset_rele(origin, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		drba->drba_cookie->drc_newfs = B_TRUE;
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
 	if (drba->drba_cookie->drc_resumable) {
 		/*
 		 * Record the resume state in the new dataset's ZAP: the
 		 * stream's guids and name, and the initial object (1),
 		 * offset (0) and byte count (0).
 		 */
 		dsl_dataset_zapify(newds, tx);
 		if (drrb->drr_fromguid != 0) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
 			    8, 1, &drrb->drr_fromguid, tx));
 		}
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
 		    8, 1, &drrb->drr_toguid, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
 		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
 		uint64_t one = 1;
 		uint64_t zero = 0;
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
 		    8, 1, &one, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
 		    8, 1, &zero, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
 		    8, 1, &zero, tx));
 		/* Note whether the stream was sent with embedded data. */
 		if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
 		    DMU_BACKUP_FEATURE_EMBED_DATA) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
 			    8, 1, &one, tx));
 		}
 	}
 
 	/* Mark the dataset inconsistent while the receive is in progress. */
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * If we actually created a non-clone, we need to create the
 	 * objset in our new dataset.
 	 */
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 
 	drba->drba_cookie->drc_ds = newds;
 
 	spa_history_log_internal_ds(newds, "receive", tx, "");
 }
 
 /*
  * Check callback used by dmu_recv_begin() for a stream that carries the
  * RESUMING feature flag.  Validates the begin record against the pool and
  * against the resume state stored on the partially received dataset
  * ("tofs/%recv" if it exists, otherwise tofs itself).
  *
  * Returns 0 if the receive may be resumed; EINVAL, ENOTSUP, EBUSY or the
  * dataset hold error otherwise.
  */
 static int
 dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	char recvname[ZFS_MAXNAMELEN];
 	dsl_dataset_t *ds;
 	uint64_t guid;
 	int error;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
 
 	/* Compound streams and unknown objset types cannot be resumed. */
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM)
 		return (SET_ERROR(EINVAL));
 	if (drrb->drr_type >= DMU_OST_NUMTYPES)
 		return (SET_ERROR(EINVAL));
 
 	/* SA_SPILL streams need a pool version that supports SA. */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) != 0 &&
 	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * WRITE_EMBEDDED records cannot be converted into plain WRITE
 	 * records on receive, so a stream using them requires the pool's
 	 * EMBEDDED_DATA feature; likewise LZ4-compressed embedded records
 	 * require LZ4_COMPRESS.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0 &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) != 0 &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
 	    tofs, recv_clone_name);
 
 	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
 		/* no %recv clone: the partial state lives in tofs itself */
 		error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 		if (error != 0)
 			return (error);
 	}
 
 	/* From here on ds is held; every exit goes through "out". */
 
 	/* The dataset must be marked inconsistent ... */
 	if (!DS_IS_INCONSISTENT(ds)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/* ... must carry resume state, and its toguid must match ours. */
 	if (!dsl_dataset_is_zapified(ds)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TOGUID, sizeof (guid), 1, &guid);
 	if (error != 0 || drrb->drr_toguid != guid) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * An owner means the original receive is still in progress.
 	 * Nothing else can own the dataset (e.g. after a failed receive)
 	 * because it is marked inconsistent.
 	 */
 	if (dsl_dataset_has_owner(ds)) {
 		error = SET_ERROR(EBUSY);
 		goto out;
 	}
 
 	/* This fs must not have any snapshots of its own yet. */
 	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * Note: the resume point itself is verified later, when the first
 	 * WRITE record is processed.
 	 */
 
 	/* The origin must match; a missing FROMGUID entry counts as 0. */
 	guid = 0;
 	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_FROMGUID, sizeof (guid), 1, &guid);
 	if (drrb->drr_fromguid != guid) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	error = 0;
 out:
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Sync-task half of resuming a receive; dmu_recv_begin() pairs it with
  * dmu_recv_resume_begin_check().  Locates the partially received dataset
  * ("tofs/%recv", falling back to tofs), takes ownership of it with
  * dmu_recv_tag and stores it in drc_ds.
  */
 static void
 dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	dsl_dataset_t *ds;
 	uint64_t dsobj;
 	char recvname[ZFS_MAXNAMELEN];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
 	    tofs, recv_clone_name);
 
 	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
 		/* %recv does not exist; continue in tofs */
 		VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
 		drba->drba_cookie->drc_newfs = B_TRUE;
 	}
 
 	/* clear the inconsistent flag so that we can own it */
 	ASSERT(DS_IS_INCONSISTENT(ds));
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 	/* Remember the object number; the plain hold is swapped for ownership. */
 	dsobj = ds->ds_object;
 	dsl_dataset_rele(ds, FTAG);
 
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
 
 	/* Now that we own it, mark it inconsistent again. */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
 
 	drba->drba_cookie->drc_ds = ds;
 
 	spa_history_log_internal_ds(ds, "resume receive", tx, "");
 }
 
 /*
  * Begin receiving a send stream into tofs.
  *
  * Initializes the cookie, detects the stream's byte order from the begin
  * record's magic (byteswapping the record in place if needed), starts the
  * running fletcher-4 checksum with the begin record, and then runs either
  * the normal or the resuming begin sync task, depending on the stream's
  * RESUMING feature flag.
  *
  * Returns EINVAL if the magic is not recognized, otherwise the result of
  * the sync task.
  *
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
     boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
 	struct drr_begin *drrb = &drr_begin->drr_u.drr_begin;
 
 	bzero(drc, sizeof (dmu_recv_cookie_t));
 	drc->drc_drr_begin = drr_begin;
 	drc->drc_drrb = drrb;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 	drc->drc_resumable = resumable;
 	drc->drc_cred = CRED();
 
 	if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
 		/* stream is in our native byte order */
 		fletcher_4_incremental_native(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		/* opposite byte order: checksum as-is, then swap in place */
 		drc->drc_byteswap = B_TRUE;
 		fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 		byteswap_record(drr_begin);
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 
 	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
 	    DMU_BACKUP_FEATURE_RESUMING) == 0) {
 		return (dsl_sync_task(tofs,
 		    dmu_recv_begin_check, dmu_recv_begin_sync,
 		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 	}
 
 	return (dsl_sync_task(tofs,
 	    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
 	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * One record of the receive stream: its header plus, once it has been
  * read, its payload.  Allocated and filled in by
  * receive_read_payload_and_next_header().
  */
 struct receive_record_arg {
 	dmu_replay_record_t header;
 	void *payload; /* Pointer to a buffer containing the payload */
 	/*
 	 * If the record is a write, pointer to the arc_buf_t containing the
 	 * payload.
 	 */
 	arc_buf_t *write_buf;
 	int payload_size;
 	uint64_t bytes_read; /* bytes read from stream when record created */
 	boolean_t eos_marker; /* Marks the end of the stream */
 	/* NOTE(review): presumably the linkage for receive_writer_arg.q -- confirm */
 	bqueue_node_t node;
 };
 
 /*
  * State used by the receive_*() functions that apply records to the
  * target objset.
  */
 struct receive_writer_arg {
 	objset_t *os; /* objset the records are applied to */
 	boolean_t byteswap; /* payloads must be byteswapped before use */
 	bqueue_t q;
 
 	/*
 	 * These three args are used to signal to the main thread that we're
 	 * done.
 	 */
 	kmutex_t mutex;
 	kcondvar_t cv;
 	boolean_t done;
 
 	int err;
 	/* A map from guid to dataset to help handle dedup'd streams. */
 	avl_tree_t *guid_to_ds_map;
 	boolean_t resumable; /* if set, save_resume_state() records progress */
 	/* Last WRITE record seen; receive_write() enforces ascending order. */
 	uint64_t last_object, last_offset;
 	uint64_t bytes_read; /* bytes read when current record created */
 };
 
 /*
  * List of object numbers, appended in ascending order by objlist_insert()
  * and trimmed from the head by objlist_exists().
  */
 struct objlist {
 	list_t list; /* List of struct receive_objnode. */
 	/*
 	 * Last object looked up. Used to assert that objects are being looked
 	 * up in ascending order.
 	 */
 	uint64_t last_lookup;
 };
 
 /* A single entry of a struct objlist. */
 struct receive_objnode {
 	list_node_t node; /* linkage within objlist.list */
 	uint64_t object; /* object number recorded by objlist_insert() */
 };
 
 /*
  * State of the stream reader: where the stream comes from, how far it has
  * been read, and the running checksum.
  */
 struct receive_arg  {
 	objset_t *os; /* objset consulted for object info and prefetch */
 	kthread_t *td; /* thread passed to the read in restore_bytes() */
 	struct file *fp; /* file the stream is read from */
 	uint64_t voff; /* The current offset in the stream */
 	uint64_t bytes_read; /* total bytes consumed by receive_read() */
 	/*
 	 * A record that has had its payload read in, but hasn't yet been handed
 	 * off to the worker thread.
 	 */
 	struct receive_record_arg *rrd;
 	/* A record that has had its header read in, but not its payload. */
 	struct receive_record_arg *next_rrd;
 	zio_cksum_t cksum; /* running fletcher-4 checksum of the stream */
 	/* cksum as it was before the most recent record header was added */
 	zio_cksum_t prev_cksum;
 	int err; /* result of the most recent restore_bytes() call */
 	boolean_t byteswap; /* stream is in the opposite byte order */
 	/* Sorted list of objects not to issue prefetches for. */
 	struct objlist ignore_objlist;
 };
 
 /*
  * Entry of the guid-to-dataset AVL tree, ordered by guid (see
  * guid_compare()).
  */
 typedef struct guid_map_entry {
 	uint64_t	guid;		/* search key */
 	dsl_dataset_t	*gme_ds;	/* released in free_guid_map_onexit() */
 	avl_node_t	avlnode;	/* AVL tree linkage */
 } guid_map_entry_t;
 
 /*
  * AVL comparator for guid_map_entry_t: orders entries by their guid.
  * Returns -1, 0 or 1.
  */
 static int
 guid_compare(const void *arg1, const void *arg2)
 {
 	const guid_map_entry_t *lhs = arg1;
 	const guid_map_entry_t *rhs = arg2;
 
 	if (lhs->guid == rhs->guid)
 		return (0);
 	return (lhs->guid < rhs->guid ? -1 : 1);
 }
 
 /*
  * Tear down a guid-to-dataset map: drop the long hold and the hold on
  * every entry's dataset (both tagged with the entry itself), free the
  * entries, then destroy and free the tree.
  */
 static void
 free_guid_map_onexit(void *arg)
 {
 	avl_tree_t *tree = arg;
 	void *cookie = NULL;
 
 	for (;;) {
 		guid_map_entry_t *entry = avl_destroy_nodes(tree, &cookie);
 
 		if (entry == NULL)
 			break;
 		dsl_dataset_long_rele(entry->gme_ds, entry);
 		dsl_dataset_rele(entry->gme_ds, entry);
 		kmem_free(entry, sizeof (guid_map_entry_t));
 	}
 	avl_destroy(tree);
 	kmem_free(tree, sizeof (avl_tree_t));
 }
 
 /*
  * Read up to len bytes from the stream file at offset off into buf.
  *
  * On return *resid holds the uio's residual count (bytes NOT transferred).
  * Returns the error from fo_read() in the kernel; outside the kernel no
  * read is attempted and EOPNOTSUPP is returned with *resid == len.
  */
 static int
 restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	/* Describe a single kernel-space buffer to read into. */
 	aiov.iov_base = buf;
 	aiov.iov_len = len;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = len;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = off;
 	auio.uio_td = ra->td;
 #ifdef _KERNEL
 	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
 #else
 	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
 	error = EOPNOTSUPP;
 #endif
 	*resid = auio.uio_resid;
 	return (error);
 }
 
 /*
  * Read exactly len bytes of the stream into buf, advancing ra->voff as
  * data arrives and adding len to ra->bytes_read on success.
  *
  * Returns 0 on success.  A read error is returned as-is (and left in
  * ra->err); a read that transfers nothing at all is reported as ECKSUM.
  */
 static int
 receive_read(struct receive_arg *ra, int len, void *buf)
 {
 	int done = 0;
 
 	/* some consumers need 8-byte alignment, so require it everywhere */
 	ASSERT0(len % 8);
 
 	while (done < len) {
 		int want = len - done;
 		ssize_t resid;
 
 		ra->err = restore_bytes(ra, (char *)buf + done,
 		    want, ra->voff, &resid);
 
 		if (resid == want) {
 			/*
 			 * Nothing was transferred.  ECKSUM tells the caller
 			 * that the receive was interrupted and can
 			 * potentially be resumed.
 			 */
 			ra->err = SET_ERROR(ECKSUM);
 		}
 		ra->voff += want - resid;
 		done = len - resid;
 		if (ra->err != 0)
 			return (ra->err);
 	}
 
 	ra->bytes_read += len;
 
 	ASSERT3U(done, ==, len);
 	return (0);
 }
 
 /*
  * Byteswap a replay record header in place.
  *
  * drr_type and drr_payloadlen are swapped first, so the switch below
  * dispatches on the already-swapped record type.  Each case then swaps
  * the multi-byte fields of that record type.  Record types without a case
  * have only the common fields (and, if not DRR_BEGIN, the trailing
  * checksum) swapped.
  */
 static void
 byteswap_record(dmu_replay_record_t *drr)
 {
 /* Swap a 64-/32-bit member of drr->drr_u in place. */
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
 		DO64(drr_begin.drr_versioninfo);
 		DO64(drr_begin.drr_creation_time);
 		DO32(drr_begin.drr_type);
 		DO32(drr_begin.drr_flags);
 		DO64(drr_begin.drr_toguid);
 		DO64(drr_begin.drr_fromguid);
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
 		DO32(drr_object.drr_bonuslen);
 		DO64(drr_object.drr_toguid);
 		break;
 	case DRR_FREEOBJECTS:
 		DO64(drr_freeobjects.drr_firstobj);
 		DO64(drr_freeobjects.drr_numobjs);
 		DO64(drr_freeobjects.drr_toguid);
 		break;
 	case DRR_WRITE:
 		DO64(drr_write.drr_object);
 		DO32(drr_write.drr_type);
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_length);
 		DO64(drr_write.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_BYREF:
 		DO64(drr_write_byref.drr_object);
 		DO64(drr_write_byref.drr_offset);
 		DO64(drr_write_byref.drr_length);
 		DO64(drr_write_byref.drr_toguid);
 		DO64(drr_write_byref.drr_refguid);
 		DO64(drr_write_byref.drr_refobject);
 		DO64(drr_write_byref.drr_refoffset);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
 		    drr_key.ddk_cksum);
 		DO64(drr_write_byref.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		DO64(drr_write_embedded.drr_object);
 		DO64(drr_write_embedded.drr_offset);
 		DO64(drr_write_embedded.drr_length);
 		DO64(drr_write_embedded.drr_toguid);
 		DO32(drr_write_embedded.drr_lsize);
 		DO32(drr_write_embedded.drr_psize);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
 	case DRR_SPILL:
 		DO64(drr_spill.drr_object);
 		DO64(drr_spill.drr_length);
 		DO64(drr_spill.drr_toguid);
 		break;
 	case DRR_END:
 		DO64(drr_end.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	}
 
 	/* Every record except DRR_BEGIN also has its drr_checksum swapped. */
 	if (drr->drr_type != DRR_BEGIN) {
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
 	}
 
 #undef DO64
 #undef DO32
 }
 
 /*
  * Compute the number of block pointers implied by a bonus buffer of the
  * given type and size: always 1 for DMU_OT_SA, otherwise 1 plus however
  * many block pointers fit in the unused part of the bonus area.
  */
 static inline uint8_t
 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 {
 	if (bonus_type != DMU_OT_SA) {
 		return (1 +
 		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
 	}
 	return (1);
 }
 
 /*
  * Record (object, offset, bytes_read) as the resume point for tx's txg in
  * the dataset's per-txg ds_resume_* arrays.  Does nothing unless the
  * receive is resumable.
  */
 static void
 save_resume_state(struct receive_writer_arg *rwa,
     uint64_t object, uint64_t offset, dmu_tx_t *tx)
 {
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	if (!rwa->resumable)
 		return;
 
 	dsl_dataset_t *ds = rwa->os->os_dsl_dataset;
 
 	/*
 	 * A nonzero ds_resume_bytes[] is what signals that the on-disk
 	 * state needs updating, so the value stored must never be 0.
 	 */
 	ASSERT(rwa->bytes_read != 0);
 
 	/*
 	 * Resume points come only from write records, which always name a
 	 * valid (non-meta-dnode) object.
 	 */
 	ASSERT(object != 0);
 
 	/*
 	 * Resuming relies on records arriving sorted by (object, offset).
 	 * The callers check this already; assert it again for good measure.
 	 */
 	ASSERT3U(object, >=, ds->ds_resume_object[txgoff]);
 	ASSERT(object != ds->ds_resume_object[txgoff] ||
 	    offset >= ds->ds_resume_offset[txgoff]);
 	ASSERT3U(rwa->bytes_read, >=, ds->ds_resume_bytes[txgoff]);
 
 	ds->ds_resume_object[txgoff] = object;
 	ds->ds_resume_offset[txgoff] = offset;
 	ds->ds_resume_bytes[txgoff] = rwa->bytes_read;
 }
 
 /*
  * Apply a DRR_OBJECT record: make object drro->drr_object exist with the
  * type, block size, bonus type/length, checksum and compression given in
  * the record, and copy "data" (if non-NULL) into its bonus buffer.
  *
  * Returns 0 on success, EINVAL for an invalid record or a failed
  * lookup/free/claim/reclaim, or the error from dmu_tx_assign().
  */
 static int
 receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
     void *data)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	uint64_t object;
 	int err;
 
 	/* Reject records whose fields are out of range. */
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
 	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
 	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
 		return (SET_ERROR(EINVAL));
 	/* ENOENT: the object does not exist yet and will be claimed below. */
 	object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
 	/*
 	 * If we are losing blkptrs or changing the block size this must
 	 * be a new file instance.  We must clear out the previous file
 	 * contents before we can change this type of metadata in the dnode.
 	 */
 	if (err == 0) {
 		int nblkptr;
 
 		nblkptr = deduce_nblkptr(drro->drr_bonustype,
 		    drro->drr_bonuslen);
 
 		if (drro->drr_blksz != doi.doi_data_block_size ||
 		    nblkptr < doi.doi_nblkptr) {
 			err = dmu_free_long_range(rwa->os, drro->drr_object,
 			    0, DMU_OBJECT_END);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	if (object == DMU_NEW_OBJECT) {
 		/* currently free, want to be allocated */
 		err = dmu_object_claim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	} else if (drro->drr_type != doi.doi_type ||
 	    drro->drr_blksz != doi.doi_data_block_size ||
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* currently allocated, but with different properties */
 		err = dmu_object_reclaim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	}
 	/* The tx is assigned at this point, so it is committed even on error. */
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
 	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	/* Install the bonus buffer contents carried by the record, if any. */
 	if (data != NULL) {
 		dmu_buf_t *db;
 
 		VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    drro->drr_bonuslen);
 		}
 		dmu_buf_rele(db, FTAG);
 	}
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Apply a DRR_FREEOBJECTS record: free every existing object in
  * [drr_firstobj, drr_firstobj + drr_numobjs).
  *
  * Returns EINVAL if the range wraps, the first error from
  * dmu_free_long_object(), or the error that stopped the object walk
  * (ESRCH from dmu_object_next() is treated as a normal end).
  */
 /* ARGSUSED */
 static int
 receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj = drrfo->drr_firstobj;
 	int walk_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	while (obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
 	    walk_err == 0) {
 		/* Only objects that currently exist need freeing. */
 		if (dmu_object_info(rwa->os, obj, NULL) == 0) {
 			int err = dmu_free_long_object(rwa->os, obj);
 
 			if (err != 0)
 				return (err);
 		}
 		walk_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 	}
 
 	return (walk_err == ESRCH ? 0 : walk_err);
 }
 
 /*
  * Apply a DRR_WRITE record: assign the loaned ARC buffer abuf (holding the
  * record's payload) to drrw->drr_object at drrw->drr_offset, and record
  * the resume point.
  *
  * Returns 0 on success, EINVAL for an invalid or out-of-order record or a
  * missing object, or the error from dmu_tx_assign().  On failure abuf has
  * not been consumed by this function.
  */
 static int
 receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
     arc_buf_t *abuf)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * For resuming to work, records must be in increasing order
 	 * by (object, offset).
 	 */
 	if (drrw->drr_object < rwa->last_object ||
 	    (drrw->drr_object == rwa->last_object &&
 	    drrw->drr_offset < rwa->last_offset)) {
 		return (SET_ERROR(EINVAL));
 	}
 	rwa->last_object = drrw->drr_object;
 	rwa->last_offset = drrw->drr_offset;
 
 	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 	if (rwa->byteswap) {
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(drrw->drr_type);
 		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
 		    drrw->drr_length);
 	}
 
 	dmu_buf_t *bonus;
 	if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) {
 		/*
 		 * The tx has already been assigned, so it must be
 		 * committed before bailing out; otherwise it is leaked.
 		 */
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
 
 	/*
 	 * Note: If the receive fails, we want the resume stream to start
 	 * with the same record that we last successfully received (as opposed
 	 * to the next record), so that we can verify that we are
 	 * resuming from the correct location.
 	 */
 	save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
 	dmu_tx_commit(tx);
 	dmu_buf_rele(bonus, FTAG);
 
 	return (0);
 }
 
 /*
  * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
  * streams to refer to a copy of the data that is already on the
  * system because it came in earlier in the stream.  This function
  * finds the earlier copy of the data, and uses that copy instead of
  * data from the stream to fulfill this write.
  *
  * Returns 0 on success, EINVAL for a wrapping range or an unknown
  * referenced dataset, or the error from dmu_buf_hold()/dmu_tx_assign().
  */
 static int
 receive_write_byref(struct receive_writer_arg *rwa,
     struct drr_write_byref *drrwbr)
 {
 	dmu_tx_t *tx;
 	int err;
 	guid_map_entry_t gmesrch;
 	guid_map_entry_t *gmep;
 	avl_index_t where;
 	objset_t *ref_os = NULL;
 	dmu_buf_t *dbp;
 
 	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If the GUID of the referenced dataset is different from the
 	 * GUID of the target dataset, find the referenced dataset.
 	 */
 	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
 		gmesrch.guid = drrwbr->drr_refguid;
 		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
 		    &where)) == NULL) {
 			return (SET_ERROR(EINVAL));
 		}
 		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
 			return (SET_ERROR(EINVAL));
 	} else {
 		ref_os = rwa->os;
 	}
 
 	/* Hold the block containing the earlier copy of the data. */
 	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
 	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		/* Drop the source buffer hold taken above; don't leak it. */
 		dmu_buf_rele(dbp, FTAG);
 		return (err);
 	}
 	dmu_write(rwa->os, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
 	dmu_buf_rele(dbp, FTAG);
 
 	/* See comment in receive_write. */
 	save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Apply a DRR_WRITE_EMBEDDED record: write the embedded payload "data"
  * to drrwe->drr_object at drrwe->drr_offset and record the resume point.
  *
  * Returns 0 on success, EINVAL if the record's range wraps or its psize,
  * etype or compression is out of range, or the error from
  * dmu_tx_assign().
  */
 static int
 receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwe, void *data)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	/* Use SET_ERROR like every other validation failure in this file. */
 	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
 		return (SET_ERROR(EINVAL));
 	if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 		return (SET_ERROR(EINVAL));
 
 	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwe->drr_object,
 	    drrwe->drr_offset, drrwe->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	dmu_write_embedded(rwa->os, drrwe->drr_object,
 	    drrwe->drr_offset, data, drrwe->drr_etype,
 	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
 	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 	/* See comment in receive_write. */
 	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 static int
 receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
     void *data)
 {
 	dmu_tx_t *tx;
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
 	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
 	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_buf_rele(db, FTAG);
 		dmu_buf_rele(db_spill, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 	dmu_buf_will_dirty(db_spill, tx);
 
 	if (db_spill->db_size < drrs->drr_length)
 		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
 		    drrs->drr_length, tx));
 	bcopy(data, db_spill->db_data, drrs->drr_length);
 
 	dmu_buf_rele(db, FTAG);
 	dmu_buf_rele(db_spill, FTAG);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Apply a DRR_FREE record: free [drr_offset, drr_offset + drr_length) of
  * drrf->drr_object.  A length of -1ULL is exempt from the range-wrap
  * check.
  *
  * Returns EINVAL for a wrapping range or a missing object, otherwise the
  * result of dmu_free_long_range().
  */
 /* ARGSUSED */
 static int
 receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	if (drrf->drr_length != -1ULL &&
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	return (dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length));
 }
 
 /*
  * Error-path cleanup of drc_ds.  A resumable receive keeps the dataset
  * (after waiting for its resume state to sync) and only gives up
  * ownership; a non-resumable one disowns the dataset and destroys it.
  */
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
 	char name[MAXNAMELEN];
 
 	if (drc->drc_resumable) {
 		/* make sure our resume state has reached disk first */
 		txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
 		dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
 		return;
 	}
 
 	/* Capture the name while we still own the dataset. */
 	dsl_dataset_name(drc->drc_ds, name);
 	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
 	(void) dsl_destroy_head(name);
 }
 
 /*
  * Fold len bytes of buf into the running stream checksum ra->cksum, using
  * the fletcher-4 variant that matches the stream's byte order.
  */
 static void
 receive_cksum(struct receive_arg *ra, int len, void *buf)
 {
 	if (!ra->byteswap) {
 		fletcher_4_incremental_native(buf, len, &ra->cksum);
 		return;
 	}
 	fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
 }
 
 /*
  * Read the payload into a buffer of size len, and update the current record's
  * payload field.
  * Allocate ra->next_rrd and read the next record's header into
  * ra->next_rrd->header.
  * Verify checksum of payload and next record.
  *
  * Returns 0 on success.  On any failure ra->next_rrd is freed and left
  * NULL; the error is whatever receive_read() returned, EINVAL if the next
  * record is a DRR_BEGIN, or ECKSUM on a checksum mismatch.
  */
 static int
 receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
 {
 	int err;
 
 	if (len != 0) {
 		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
 		err = receive_read(ra, len, buf);
 		if (err != 0)
 			return (err);
 		receive_cksum(ra, len, buf);
 
 		/* note: rrd is NULL when reading the begin record's payload */
 		if (ra->rrd != NULL) {
 			ra->rrd->payload = buf;
 			ra->rrd->payload_size = len;
 			ra->rrd->bytes_read = ra->bytes_read;
 		}
 	}
 
 	/*
 	 * Snapshot the checksum before the next header is folded in;
 	 * receive_read_record() compares it with a DRR_END's checksum.
 	 */
 	ra->prev_cksum = ra->cksum;
 
 	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
 	err = receive_read(ra, sizeof (ra->next_rrd->header),
 	    &ra->next_rrd->header);
 	ra->next_rrd->bytes_read = ra->bytes_read;
 	if (err != 0) {
 		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
 		ra->next_rrd = NULL;
 		return (err);
 	}
 	/* A second BEGIN record inside the stream is invalid. */
 	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
 		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
 		ra->next_rrd = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: checksum is of everything up to but not including the
 	 * checksum itself.
 	 */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	receive_cksum(ra,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &ra->next_rrd->header);
 
 	/*
 	 * Keep a copy of the checksum field as it appeared on the stream
 	 * (before any byteswap) so it can be folded into the running
 	 * checksum below.
 	 */
 	zio_cksum_t cksum_orig =
 	    ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
 	zio_cksum_t *cksump =
 	    &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
 
 	if (ra->byteswap)
 		byteswap_record(&ra->next_rrd->header);
 
 	/* An all-zero record checksum is accepted without comparison. */
 	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
 	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
 		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
 		ra->next_rrd = NULL;
 		return (SET_ERROR(ECKSUM));
 	}
 
 	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
 
 	return (0);
 }
 
 /* Initialize an empty objlist whose last looked-up object is 0. */
 static void
 objlist_create(struct objlist *list)
 {
 	list->last_lookup = 0;
 	list_create(&list->list, sizeof (struct receive_objnode),
 	    offsetof(struct receive_objnode, node));
 }
 
 /* Free every node still on the objlist, then destroy the list itself. */
 static void
 objlist_destroy(struct objlist *list)
 {
 	struct receive_objnode *entry;
 
 	while ((entry = list_remove_head(&list->list)) != NULL)
 		kmem_free(entry, sizeof (*entry));
 
 	list_destroy(&list->list);
 }
 
 /*
  * Report whether the given object number is on the objlist.  As a side
  * effect, every entry smaller than the requested object number is removed
  * and freed.  Consequently a lookup of an object number smaller than one
  * looked up earlier always returns false, so callers must perform their
  * lookups in ascending order.
  */
 static boolean_t
 objlist_exists(struct objlist *list, uint64_t object)
 {
 	struct receive_objnode *head;
 
 	ASSERT3U(object, >=, list->last_lookup);
 	list->last_lookup = object;
 
 	for (;;) {
 		head = list_head(&list->list);
 		if (head == NULL || head->object >= object)
 			break;
 		/* stale entry below the lookup point: discard it */
 		VERIFY3P(head, ==, list_remove_head(&list->list));
 		kmem_free(head, sizeof (*head));
 	}
 
 	return (head != NULL && head->object == object);
 }
 
 /*
  * Append an object number to the objlist.  The list is meant to hold
  * object numbers in ascending order, but for simplicity insertion does
  * not search for the right position: it always appends at the tail.
  * Callers are therefore responsible for inserting in ascending order
  * (checked only under ZFS_DEBUG).
  */
 static void
 objlist_insert(struct objlist *list, uint64_t object)
 {
 	struct receive_objnode *entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
 
 	entry->object = object;
 #ifdef ZFS_DEBUG
 	struct receive_objnode *tail = list_tail(&list->list);
 	uint64_t tail_objnum = (tail != NULL ? tail->object : 0);
 	ASSERT3U(entry->object, >, tail_objnum);
 #endif
 	list_insert_tail(&list->list, entry);
 }
 
 /*
  * Issue the prefetch reads for any necessary indirect blocks.
  *
  * The object ignore list decides whether a prefetch is issued for a given
  * object.  It exists for correctness (the object's blocksize may have
  * changed) and for performance (no point prefetching for an object that
  * doesn't exist).  The list is trimmed as the stream is consumed so that
  * it cannot grow without bound.
  *
  * Object numbers in the list are sorted, and so are the write records we
  * see, but the two are not sorted relative to each other (several object
  * records may arrive before each object's write records).  Once a given
  * object number has been reached, all lower object numbers can therefore
  * be dropped from the ignore list.  In practice up to 32 object records
  * arrive before their write records, so the list holds up to 32 nodes.
  */
 /* ARGSUSED */
 static void
 receive_read_prefetch(struct receive_arg *ra,
     uint64_t object, uint64_t offset, uint64_t length)
 {
 	if (objlist_exists(&ra->ignore_objlist, object))
 		return;
 
 	dmu_prefetch(ra->os, object, 1, offset, length,
 	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
  * Read records off the stream, issuing any necessary prefetches.
  */
 static int
 receive_read_record(struct receive_arg *ra)
 {
 	int err;
 
 	switch (ra->rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
 		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
 		void *buf = kmem_zalloc(size, KM_SLEEP);
 		dmu_object_info_t doi;
 		err = receive_read_payload_and_next_header(ra, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 		err = dmu_object_info(ra->os, drro->drr_object, &doi);
 		/*
 		 * See receive_read_prefetch for an explanation why we're
 		 * storing this object in the ignore_obj_list.
 		 */
 		if (err == ENOENT ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
 			objlist_insert(&ra->ignore_objlist, drro->drr_object);
 			err = 0;
 		}
 		return (err);
 	}
 	case DRR_FREEOBJECTS:
 	{
 		err = receive_read_payload_and_next_header(ra, 0, NULL);
 		return (err);
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
 		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
 		    drrw->drr_length);
 
 		err = receive_read_payload_and_next_header(ra,
 		    drrw->drr_length, abuf->b_data);
 		if (err != 0) {
 			dmu_return_arcbuf(abuf);
 			return (err);
 		}
 		ra->rrd->write_buf = abuf;
 		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
 		    drrw->drr_length);
 		return (err);
 	}
 	case DRR_WRITE_BYREF:
 	{
 		struct drr_write_byref *drrwb =
 		    &ra->rrd->header.drr_u.drr_write_byref;
 		err = receive_read_payload_and_next_header(ra, 0, NULL);
 		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
 		    drrwb->drr_length);
 		return (err);
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &ra->rrd->header.drr_u.drr_write_embedded;
 		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
 		void *buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(ra, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 
 		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
 		    drrwe->drr_length);
 		return (err);
 	}
 	case DRR_FREE:
 	{
 		/*
 		 * It might be beneficial to prefetch indirect blocks here, but
 		 * we don't really have the data to decide for sure.
 		 */
 		err = receive_read_payload_and_next_header(ra, 0, NULL);
 		return (err);
 	}
 	case DRR_END:
 	{
 		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
 		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
 			return (SET_ERROR(ECKSUM));
 		return (0);
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
 		void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
 		err = receive_read_payload_and_next_header(ra, drrs->drr_length,
 		    buf);
 		if (err != 0)
 			kmem_free(buf, drrs->drr_length);
 		return (err);
 	}
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 }
 
 /*
  * Apply one queued record to the pool by dispatching on its type to the
  * matching receive_*() routine.  Payload buffers attached to the record are
  * freed here once the routine has run; for DRR_WRITE the arc buffer is only
  * returned when receive_write() fails.
  *
  * Returns the handler's result, or EINVAL for an unexpected record type.
  */
 static int
 receive_process_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err;
 
 	/* Processing in order, therefore bytes_read should be increasing. */
 	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
 	rwa->bytes_read = rrd->bytes_read;
 
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 		err = receive_object(rwa, &rrd->header.drr_u.drr_object,
 		    rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	case DRR_FREEOBJECTS:
 		err = receive_freeobjects(rwa,
 		    &rrd->header.drr_u.drr_freeobjects);
 		break;
 	case DRR_WRITE:
 		err = receive_write(rwa, &rrd->header.drr_u.drr_write,
 		    rrd->write_buf);
 		/* if receive_write() is successful, it consumes the arc_buf */
 		if (err != 0)
 			dmu_return_arcbuf(rrd->write_buf);
 		rrd->write_buf = NULL;
 		rrd->payload = NULL;
 		break;
 	case DRR_WRITE_BYREF:
 		err = receive_write_byref(rwa,
 		    &rrd->header.drr_u.drr_write_byref);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		err = receive_write_embedded(rwa,
 		    &rrd->header.drr_u.drr_write_embedded, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	case DRR_FREE:
 		err = receive_free(rwa, &rrd->header.drr_u.drr_free);
 		break;
 	case DRR_SPILL:
 		err = receive_spill(rwa, &rrd->header.drr_u.drr_spill,
 		    rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	default:
 		err = SET_ERROR(EINVAL);
 		break;
 	}
 
 	return (err);
 }
 
 /*
  * dmu_recv_stream's worker thread.  Pull records off the queue and hand each
  * one to receive_process_record() until the end-of-stream marker arrives.
  * When we're done, signal the main thread and exit.
  */
 static void
 receive_writer_thread(void *arg)
 {
 	struct receive_writer_arg *rwa = arg;
 	struct receive_record_arg *rec;
 
 	for (;;) {
 		rec = bqueue_dequeue(&rwa->q);
 		if (rec->eos_marker)
 			break;
 
 		/*
 		 * Once an error has been recorded the main thread stops
 		 * queueing, but whatever is already queued still has to be
 		 * drained and its buffers released before we can exit.
 		 */
 		if (rwa->err == 0) {
 			rwa->err = receive_process_record(rwa, rec);
 		} else if (rec->write_buf != NULL) {
 			dmu_return_arcbuf(rec->write_buf);
 			rec->write_buf = NULL;
 			rec->payload = NULL;
 		} else if (rec->payload != NULL) {
 			kmem_free(rec->payload, rec->payload_size);
 			rec->payload = NULL;
 		}
 		kmem_free(rec, sizeof (*rec));
 	}
 
 	/* Free the end-of-stream marker record itself. */
 	kmem_free(rec, sizeof (*rec));
 
 	mutex_enter(&rwa->mutex);
 	rwa->done = B_TRUE;
 	cv_signal(&rwa->cv);
 	mutex_exit(&rwa->mutex);
 	thread_exit();
 }
 
 /*
  * Verify that the "resume_object" and "resume_offset" values in the stream's
  * begin nvlist equal the DS_FIELD_RESUME_OBJECT / DS_FIELD_RESUME_OFFSET
  * values stored in the dataset's ZAP.
  *
  * Returns 0 when both match, EINVAL when either key is missing from the
  * nvlist or either value differs.
  */
 static int
 resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
 {
 	objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
 	uint64_t dsobj = dmu_objset_id(ra->os);
 	uint64_t resume_obj, resume_off;
 	uint64_t stored;
 
 	if (nvlist_lookup_uint64(begin_nvl, "resume_object",
 	    &resume_obj) != 0)
 		return (SET_ERROR(EINVAL));
 	if (nvlist_lookup_uint64(begin_nvl, "resume_offset",
 	    &resume_off) != 0)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OBJECT, sizeof (stored), 1, &stored));
 	if (stored != resume_obj)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OFFSET, sizeof (stored), 1, &stored));
 	if (stored != resume_off)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
  * worker thread, read the records off the stream one by one, and issue
  * prefetches for any necessary indirect blocks.  It will then push the records
  * onto an internal blocking queue.  The worker thread will pull the records off
  * the queue, and actually write the data into the DMU.  This way, the worker
  * thread doesn't have to wait for reads to complete, since everything it needs
  * (the indirect blocks) will be prefetched.
  *
  * drc is the receive cookie, fp the stream's file, and *voffp the stream
  * offset, which is read on entry and rewritten from ra.voff on every return
  * path.  cleanup_fd and *action_handlep are only consulted for dedup streams,
  * where they locate (or register) the guid-to-dataset map.
  *
  * Returns 0 on success.  On any failure dmu_recv_cleanup_ds() is called on
  * drc and the error is returned.
  *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep)
 {
 	int err = 0;
 	struct receive_arg ra = { 0 };
 	struct receive_writer_arg rwa = { 0 };
 	int featureflags;
 	nvlist_t *begin_nvl = NULL;
 
 	ra.byteswap = drc->drc_byteswap;
 	ra.cksum = drc->drc_cksum;
 	ra.td = curthread;
 	ra.fp = fp;
 	ra.voff = *voffp;
 
 	if (dsl_dataset_is_zapified(drc->drc_ds)) {
 		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
 		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
 		    sizeof (ra.bytes_read), 1, &ra.bytes_read);
 	}
 
 	objlist_create(&ra.ignore_objlist);
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
 	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	/*
 	 * Open the objset we are modifying.
 	 */
 	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
 
 	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
 	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
 	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
 		minor_t minor;
 
 		/*
 		 * Every failure in this block is recorded in 'err', which is
 		 * the value tested and returned at 'out'.  Storing it in
 		 * ra.err instead would make the function return success and
 		 * skip dmu_recv_cleanup_ds().
 		 */
 		if (cleanup_fd == -1) {
 			err = SET_ERROR(EBADF);
 			goto out;
 		}
 		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
 		if (err != 0) {
 			/* No hold was taken, so 'out' must not release it. */
 			cleanup_fd = -1;
 			goto out;
 		}
 
 		if (*action_handlep == 0) {
 			rwa.guid_to_ds_map =
 			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 			avl_create(rwa.guid_to_ds_map, guid_compare,
 			    sizeof (guid_map_entry_t),
 			    offsetof(guid_map_entry_t, avlnode));
 			err = zfs_onexit_add_cb(minor,
 			    free_guid_map_onexit, rwa.guid_to_ds_map,
 			    action_handlep);
 			/*
 			 * NOTE(review): if registration fails the freshly
 			 * created map is not freed here -- confirm whether
 			 * that path needs explicit cleanup.
 			 */
 			if (err != 0)
 				goto out;
 		} else {
 			err = zfs_onexit_cb_data(minor, *action_handlep,
 			    (void **)&rwa.guid_to_ds_map);
 			if (err != 0)
 				goto out;
 		}
 
 		drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
 	}
 
 	/*
 	 * Read the begin record's payload (if any) together with the header
 	 * of the first real record; the payload is a packed nvlist.
 	 */
 	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
 	void *payload = NULL;
 	if (payloadlen != 0)
 		payload = kmem_alloc(payloadlen, KM_SLEEP);
 
 	err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
 	if (err != 0) {
 		if (payloadlen != 0)
 			kmem_free(payload, payloadlen);
 		goto out;
 	}
 	if (payloadlen != 0) {
 		err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
 		kmem_free(payload, payloadlen);
 		if (err != 0)
 			goto out;
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = resume_check(&ra, begin_nvl);
 		if (err != 0)
 			goto out;
 	}
 
 	(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
 	    offsetof(struct receive_record_arg, node));
 	cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
 	rwa.os = ra.os;
 	rwa.byteswap = drc->drc_byteswap;
 	rwa.resumable = drc->drc_resumable;
 
 	(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
 	    TS_RUN, minclsyspri);
 	/*
 	 * We're reading rwa.err without locks, which is safe since we are the
 	 * only reader, and the worker thread is the only writer.  It's ok if we
 	 * miss a write for an iteration or two of the loop, since the writer
 	 * thread will keep freeing records we send it until we send it an eos
 	 * marker.
 	 *
 	 * We can leave this loop in 3 ways:  First, if rwa.err is
 	 * non-zero.  In that case, the writer thread will free the rrd we just
 	 * pushed.  Second, if  we're interrupted; in that case, either it's the
 	 * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
 	 * has been handed off to the writer thread who will free it.  Finally,
 	 * if receive_read_record fails or we're at the end of the stream, then
 	 * we free ra.rrd and exit.
 	 */
 	while (rwa.err == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
 		ASSERT3P(ra.rrd, ==, NULL);
 		ra.rrd = ra.next_rrd;
 		ra.next_rrd = NULL;
 		/* Allocates and loads header into ra.next_rrd */
 		err = receive_read_record(&ra);
 
 		if (ra.rrd->header.drr_type == DRR_END || err != 0) {
 			kmem_free(ra.rrd, sizeof (*ra.rrd));
 			ra.rrd = NULL;
 			break;
 		}
 
 		bqueue_enqueue(&rwa.q, ra.rrd,
 		    sizeof (struct receive_record_arg) + ra.rrd->payload_size);
 		ra.rrd = NULL;
 	}
 
 	/* Tell the writer thread to finish, reusing next_rrd if we have one. */
 	if (ra.next_rrd == NULL)
 		ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
 	ra.next_rrd->eos_marker = B_TRUE;
 	bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
 
 	mutex_enter(&rwa.mutex);
 	while (!rwa.done) {
 		cv_wait(&rwa.cv, &rwa.mutex);
 	}
 	mutex_exit(&rwa.mutex);
 
 	cv_destroy(&rwa.cv);
 	mutex_destroy(&rwa.mutex);
 	bqueue_destroy(&rwa.q);
 	/* A reader-side error takes precedence over the writer's. */
 	if (err == 0)
 		err = rwa.err;
 
 out:
 	nvlist_free(begin_nvl);
 	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
 		zfs_onexit_fd_rele(cleanup_fd);
 
 	if (err != 0) {
 		/*
 		 * Clean up references. If receive is not resumable,
 		 * destroy what we created, so we don't leave it in
 		 * the inconsistent state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 	}
 
 	*voffp = ra.voff;
 	objlist_destroy(&ra.ignore_objlist);
 	return (err);
 }
 
 /*
  * Check half of the sync task issued by dmu_recv_existing_end() and
  * dmu_recv_new_end().  For a new filesystem only the snapshot creation is
  * checked.  Otherwise the head of drc_tofs is held and, in order: (with
  * drc_force) every snapshot that would be destroyed, the clone swap, the
  * snapshot creation, and finally the destruction of drc_ds are checked.
  *
  * Returns 0 if everything may proceed, else the first error encountered.
  */
 static int
 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *origin_head;
 	int error;
 
 	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
 
 	if (drc->drc_newfs) {
 		return (dsl_dataset_snapshot_check_impl(drc->drc_ds,
 		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred));
 	}
 
 	error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 	if (error != 0)
 		return (error);
 
 	if (drc->drc_force) {
 		/*
 		 * We will destroy any snapshots in tofs (i.e. before
 		 * origin_head) that are after the origin (which is
 		 * the snap before drc_ds, because drc_ds can not
 		 * have any snaps of its own).
 		 */
 		uint64_t obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 
 		while (obj !=
 		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 			dsl_dataset_t *snap;
 
 			error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
 			if (error != 0)
 				break;
 			if (snap->ds_dir != origin_head->ds_dir) {
 				error = SET_ERROR(EINVAL);
 			} else {
 				error = dsl_destroy_snapshot_check_impl(
 				    snap, B_FALSE);
 			}
 			/* Step to the previous snapshot before releasing. */
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 			if (error != 0)
 				break;
 		}
 	}
 
 	/* Each later check runs only if everything before it passed. */
 	if (error == 0) {
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
 		    origin_head, drc->drc_force, drc->drc_owner, tx);
 	}
 	if (error == 0) {
 		error = dsl_dataset_snapshot_check_impl(origin_head,
 		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
 	}
 	dsl_dataset_rele(origin_head, FTAG);
 	if (error != 0)
 		return (error);
 
 	return (dsl_destroy_head_check_impl(drc->drc_ds, 1));
 }
 
 /*
  * Sync half of the sync task issued by dmu_recv_existing_end() and
  * dmu_recv_new_end() (paired with dmu_recv_end_check()).  Logs the
  * completion, creates the drc_tosnap snapshot, stamps that snapshot with the
  * creation time and guid from the stream's begin record, clears
  * DS_FLAG_INCONSISTENT, records the new snapshot's object number in
  * drc_newsnapobj, and disowns drc_ds.
  */
 static void
 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
 	    tx, "snap=%s", drc->drc_tosnap);
 
 	if (!drc->drc_newfs) {
 		/*
 		 * Receiving into an existing filesystem: swap drc_ds with
 		 * the head of drc_tofs, snapshot the head, then destroy
 		 * drc_ds.
 		 */
 		dsl_dataset_t *origin_head;
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
 
 		if (drc->drc_force) {
 			/*
 			 * Destroy any snapshots of drc_tofs (origin_head)
 			 * after the origin (the snap before drc_ds).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap));
 				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
 				/* Read the link before the snap is destroyed. */
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_destroy_snapshot_sync_impl(snap,
 				    B_FALSE, tx);
 				dsl_dataset_rele(snap, FTAG);
 			}
 		}
 		VERIFY3P(drc->drc_ds->ds_prev, ==,
 		    origin_head->ds_prev);
 
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		dsl_dataset_snapshot_sync_impl(origin_head,
 		    drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
 
 		/*
 		 * NOTE(review): origin_head is dereferenced here after its
 		 * FTAG hold was released above -- presumably the owner's
 		 * hold keeps it alive; confirm.
 		 */
 		if (drc->drc_owner != NULL)
 			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		/* New filesystem: snapshot drc_ds itself. */
 		dsl_dataset_t *ds = drc->drc_ds;
 
 		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(ds->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 		/* The receive is complete, so drop any saved resume state. */
 		if (dsl_dataset_has_resume_receive_state(ds)) {
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_FROMGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OBJECT, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OFFSET, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_BYTES, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TOGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TONAME, tx);
 		}
 	}
 	/* Consumed by dmu_recv_new_end() when it updates the guid map. */
 	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode,
 	 * we can evict its bonus buffer.
 	 */
 	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
 	drc->drc_ds = NULL;
 }
 
 /*
  * Hold snapshot object 'snapobj' in the pool that contains 'name' and add an
  * entry for it, keyed by the snapshot's guid, to 'guid_map'.  The new entry
  * itself is the tag for both the dataset hold and the long hold.
  *
  * Returns 0 on success, or the error from dsl_pool_hold() /
  * dsl_dataset_hold_obj(); nothing is added to the map on failure.
  */
 static int
 add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *snapds;
 	guid_map_entry_t *entry;
 	int err;
 
 	ASSERT(guid_map != NULL);
 
 	err = dsl_pool_hold(name, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
 	entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
 	err = dsl_dataset_hold_obj(dp, snapobj, entry, &snapds);
 	if (err != 0) {
 		kmem_free(entry, sizeof (*entry));
 	} else {
 		entry->guid = dsl_dataset_phys(snapds)->ds_guid;
 		entry->gme_ds = snapds;
 		avl_add(guid_map, entry);
 		dsl_dataset_long_hold(snapds, entry);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
 /*
  * Value passed as the fifth argument of dsl_sync_task() by
  * dmu_recv_existing_end() and dmu_recv_new_end().
  */
 static int dmu_recv_end_modified_blocks = 3;
 
 /*
  * Finish a receive into an existing filesystem by running the
  * dmu_recv_end_check / dmu_recv_end_sync sync task on drc_tofs.  If the task
  * fails, drc is passed to dmu_recv_cleanup_ds().
  *
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 	char name[MAXNAMELEN];
 	int error;
 
 #ifdef _KERNEL
 	/*
 	 * We will be destroying the ds; make sure its origin is unmounted if
 	 * necessary.
 	 */
 	dsl_dataset_name(drc->drc_ds, name);
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 	if (error == 0)
 		return (0);
 
 	dmu_recv_cleanup_ds(drc);
 	return (error);
 }
 
 /*
  * Finish a receive into a new filesystem by running the dmu_recv_end_check /
  * dmu_recv_end_sync sync task on drc_tofs.  On failure drc is passed to
  * dmu_recv_cleanup_ds(); on success, if a guid map is attached to drc, the
  * newly created snapshot is added to it (best effort, result ignored).
  *
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
 	int error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 		return (error);
 	}
 
 	if (drc->drc_guid_to_ds_map != NULL) {
 		(void) add_ds_to_guidmap(drc->drc_tofs,
 		    drc->drc_guid_to_ds_map,
 		    drc->drc_newsnapobj);
 	}
 	return (error);
 }
 
 /*
  * Complete a receive.  Records 'owner' in the cookie and then dispatches to
  * dmu_recv_new_end() or dmu_recv_existing_end() depending on drc_newfs.
  */
 int
 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
 	drc->drc_owner = owner;
 
 	return (drc->drc_newfs ?
 	    dmu_recv_new_end(drc) : dmu_recv_existing_end(drc));
 }
 
 /*
  * Report whether a receive is in progress on this objset, i.e. whether it has
  * a dataset and that dataset's owner is dmu_recv_tag.
  */
 boolean_t
 dmu_objset_is_receiving(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL)
 		return (B_FALSE);
 	return (ds->ds_owner == dmu_recv_tag);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(revision 296519)
@@ -1,1659 +1,1660 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
 #include <sys/dsl_pool.h>
 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
 #include <sys/spa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
 #include <sys/varargs.h>
 
 /*
  * Pointer to a function that takes a transaction, a dnode and two uint64_t
  * arguments and returns nothing.
  */
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
 
 /*
  * Allocate a zeroed transaction for dsl_dir 'dd' (which may be NULL), with
  * empty hold and callback lists and tx_start set to the current time.  The
  * pool is taken from 'dd' when one is given.
  */
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
 {
 	dmu_tx_t *tx;
 
 	tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
 	tx->tx_dir = dd;
 	if (dd != NULL) {
 		tx->tx_pool = dd->dd_pool;
 	}
 
 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
 
 	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
 #endif
 	return (tx);
 }
 
 /*
  * Create a transaction for objset 'os': built on the dsl_dir of the objset's
  * dataset, with tx_objset and tx_lastsnap_txg filled in from that dataset.
  */
 dmu_tx_t *
 dmu_tx_create(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dmu_tx_t *tx = dmu_tx_create_dd(ds->ds_dir);
 
 	tx->tx_objset = os;
 	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(ds);
 	return (tx);
 }
 
 /*
  * Create a transaction that is already bound to transaction group 'txg' of
  * pool 'dp'.  It has no dsl_dir and is marked tx_anyobj.
  */
 dmu_tx_t *
 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
 {
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_dd(NULL);
 
 	/* The txg must not be later than the currently open one. */
 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
 	tx->tx_txg = txg;
 	tx->tx_pool = dp;
 	tx->tx_anyobj = TRUE;
 
 	return (tx);
 }
 
 /*
  * Return the transaction's tx_anyobj flag.  In the code visible here the
  * flag is set to TRUE only by dmu_tx_create_assigned().
  */
 int
 dmu_tx_is_syncing(dmu_tx_t *tx)
 {
 	return (tx->tx_anyobj);
 }
 
 /*
  * Return the transaction's tx_anyobj flag; currently the same test as
  * dmu_tx_is_syncing().
  */
 int
 dmu_tx_private_ok(dmu_tx_t *tx)
 {
 	return (tx->tx_anyobj);
 }
 
 /*
  * Create a hold record for 'object' and append it to tx->tx_holds.  Unless
  * the object is DMU_NEW_OBJECT its dnode is held first; if the transaction
  * already has a txg, the dnode is also tagged with that txg and a tx hold.
  *
  * Returns the new hold, or NULL (with tx->tx_err set) if the dnode could not
  * be held.  type/arg1/arg2 are only recorded under ZFS_DEBUG.
  */
 static dmu_tx_hold_t *
 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 {
 	dnode_t *dn = NULL;
 	dmu_tx_hold_t *txh;
 
 	if (object != DMU_NEW_OBJECT) {
 		int err = dnode_hold(os, object, tx, &dn);
 
 		if (err != 0) {
 			tx->tx_err = err;
 			return (NULL);
 		}
 
 		if (tx->tx_txg != 0) {
 			mutex_enter(&dn->dn_mtx);
 			/*
 			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 			 * problem, but there's no way for it to happen (for
 			 * now, at least).
 			 */
 			ASSERT(dn->dn_assigned_txg == 0);
 			dn->dn_assigned_txg = tx->tx_txg;
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
 	}
 
 	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 	txh->txh_dnode = dn;
 	txh->txh_tx = tx;
 #ifdef ZFS_DEBUG
 	txh->txh_type = type;
 	txh->txh_arg1 = arg1;
 	txh->txh_arg2 = arg2;
 #endif
 	list_insert_tail(&tx->tx_holds, txh);
 
 	return (txh);
 }
 
 /*
  * Add a THT_NEWOBJECT hold on 'object' to the transaction, unless
  * dmu_tx_is_syncing() reports that the transaction is syncing.
  */
 void
 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 {
 	/*
 	 * If we're syncing, they can manipulate any object anyhow, and
 	 * the hold on the dnode_t can cause problems.
 	 */
 	if (dmu_tx_is_syncing(tx))
 		return;
 
 	(void) dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT, 0, 0);
 }
 
 /*
  * Hold block 'blkid' at 'level' of dnode 'dn' and read it (as a child of
  * 'zio', which may be NULL) so that an I/O error can be detected.
  *
  * Returns EIO if the dbuf could not be held, otherwise dbuf_read()'s result.
  */
 static int
 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 {
 	dmu_buf_impl_t *db;
 	int err;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold_level(dn, level, blkid, FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db != NULL) {
 		err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 		dbuf_rele(db, FTAG);
 		return (err);
 	}
 
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Starting at block 'blkid' of 'level', walk up through the parent blocks and
  * charge each one to the hold: its size goes to txh_space_tooverwrite if the
  * block is freeable, else to txh_space_towrite, and the on-disk size of any
  * existing block pointer goes to txh_space_tounref.
  *
  * history[] remembers the last block id charged per level; the walk stops
  * once it reaches a level at or beyond dn_nlevels, or a block that has
  * already been charged.
  */
 static void
 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
     int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 {
 	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	while (level < dn->dn_nlevels && history[level] != blkid) {
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		dmu_buf_impl_t *parent = NULL;
 		blkptr_t *bp = NULL;
 		uint64_t space;
 
 		history[level] = blkid;
 
 		if (level == 0)
 			space = dn->dn_datablksz;
 		else
 			space = 1ULL << dn->dn_indblkshift;
 
 		if (db != NULL && db != dn->dn_dbuf) {
 			ASSERT(DB_DNODE(db) == dn);
 			ASSERT(db->db_level == level);
 			ASSERT(db->db.db_size == space);
 			ASSERT(db->db_blkid == blkid);
 			bp = db->db_blkptr;
 			parent = db->db_parent;
 		} else {
 			/* No dbuf of our own to inspect at this level. */
 			ASSERT(level != 0);
 		}
 
 		/* Freeable state is inherited by the levels above. */
 		freeable = (bp && (freeable ||
 		    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 
 		if (freeable)
 			txh->txh_space_tooverwrite += space;
 		else
 			txh->txh_space_towrite += space;
 		if (bp)
 			txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 
 		/* Move up one level. */
 		db = parent;
 		level++;
 		blkid >>= epbs;
 	}
 }
 
 /*
  * Estimate the space needed to write 'len' bytes at offset 'off' and add it
  * to the hold's counters (txh_space_towrite, txh_space_tooverwrite,
  * txh_space_tounref, txh_fudge).  Does nothing when len is 0.
  *
  * When the hold has a dnode, the blocks at the edges of the range are read
  * first so that I/O errors surface here, and blocks that already exist are
  * accounted for through dmu_tx_count_twig().
  *
  * Errors are reported through txh->txh_tx->tx_err, including EFBIG when
  * towrite + tooverwrite exceeds 2 * DMU_MAX_ACCESS.
  */
 /* ARGSUSED */
 static void
 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	dnode_t *dn = txh->txh_dnode;
 	uint64_t start, end, i;
 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 	int err = 0;
 
 	if (len == 0)
 		return;
 
 	/* Widest possible block-shift bounds; narrowed below if dn is known. */
 	min_bs = SPA_MINBLOCKSHIFT;
 	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
 	if (dn) {
 		uint64_t history[DN_MAX_LEVELS];
 		int nlvls = dn->dn_nlevels;
 		int delta;
 
 		/*
 		 * For i/o error checking, read the first and last level-0
 		 * blocks (if they are not aligned), and all the level-1 blocks.
 		 */
 		if (dn->dn_maxblkid == 0) {
 			/* Single-block object: start/end are 0 or 1. */
 			delta = dn->dn_datablksz;
 			start = (off < dn->dn_datablksz) ? 0 : 1;
 			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 				if (err)
 					goto out;
 				delta -= off;
 			}
 		} else {
 			/*
 			 * NOTE(review): the early "goto out" paths below leave
 			 * without calling zio_wait() on this root zio --
 			 * confirm that is intended.
 			 */
 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
 			    NULL, NULL, ZIO_FLAG_CANFAIL);
 
 			/* first level-0 block */
 			start = off >> dn->dn_datablkshift;
 			if (P2PHASE(off, dn->dn_datablksz) ||
 			    len < dn->dn_datablksz) {
 				err = dmu_tx_check_ioerr(zio, dn, 0, start);
 				if (err)
 					goto out;
 			}
 
 			/* last level-0 block */
 			end = (off+len-1) >> dn->dn_datablkshift;
 			if (end != start && end <= dn->dn_maxblkid &&
 			    P2PHASE(off+len, dn->dn_datablksz)) {
 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
 				if (err)
 					goto out;
 			}
 
 			/* level-1 blocks */
 			if (nlvls > 1) {
 				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 				for (i = (start>>shft)+1; i < end>>shft; i++) {
 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
 					if (err)
 						goto out;
 				}
 			}
 
 			err = zio_wait(zio);
 			if (err)
 				goto out;
 			/* Bytes from 'off' up to the next block boundary. */
 			delta = P2NPHASE(off, dn->dn_datablksz);
 		}
 
 		min_ibs = max_ibs = dn->dn_indblkshift;
 		if (dn->dn_maxblkid > 0) {
 			/*
 			 * The blocksize can't change,
 			 * so we can make a more precise estimate.
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
 		} else {
 			/*
 			 * The blocksize can increase up to the recordsize,
 			 * or if it is already more than the recordsize,
 			 * up to the next power of 2.
 			 */
 			min_bs = highbit64(dn->dn_datablksz - 1);
 			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
 		 * If this write is not off the end of the file
 		 * we need to account for overwrites/unref.
 		 */
 		if (start <= dn->dn_maxblkid) {
 			/* -1ULL marks "no block charged yet" at each level. */
 			for (int l = 0; l < DN_MAX_LEVELS; l++)
 				history[l] = -1ULL;
 		}
 		while (start <= dn->dn_maxblkid) {
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			err = dbuf_hold_impl(dn, 0, start,
 			    FALSE, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
 
 			if (err) {
 				txh->txh_tx->tx_err = err;
 				return;
 			}
 
 			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 			    history);
 			dbuf_rele(db, FTAG);
 			if (++start > end) {
 				/*
 				 * Account for new indirects appearing
 				 * before this IO gets assigned into a txg.
 				 */
 				bits = 64 - min_bs;
 				epbs = min_ibs - SPA_BLKPTRSHIFT;
 				for (bits -= epbs * (nlvls - 1);
 				    bits >= 0; bits -= epbs)
 					txh->txh_fudge += 1ULL << max_ibs;
 				goto out;
 			}
 			/* Advance past the block just counted. */
 			off += delta;
 			if (len >= delta)
 				len -= delta;
 			delta = dn->dn_datablksz;
 		}
 	}
 
 	/*
 	 * 'end' is the last thing we will access, not one past.
 	 * This way we won't overflow when accessing the last byte.
 	 */
 	start = P2ALIGN(off, 1ULL << max_bs);
 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 	txh->txh_space_towrite += end - start + 1;
 
 	/* Convert byte offsets into block ids using the smallest block size. */
 	start >>= min_bs;
 	end >>= min_bs;
 
 	epbs = min_ibs - SPA_BLKPTRSHIFT;
 
 	/*
 	 * The object contains at most 2^(64 - min_bs) blocks,
 	 * and each indirect level maps 2^epbs.
 	 */
 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 		start >>= epbs;
 		end >>= epbs;
 		ASSERT3U(end, >=, start);
 		txh->txh_space_towrite += (end - start + 1) << max_ibs;
 		if (start != 0) {
 			/*
 			 * We also need a new blkid=0 indirect block
 			 * to reference any existing file data.
 			 */
 			txh->txh_space_towrite += 1ULL << max_ibs;
 		}
 	}
 
 out:
 	/* Reject estimates that exceed twice the maximum access size. */
 	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 	    2 * DMU_MAX_ACCESS)
 		err = SET_ERROR(EFBIG);
 
 	if (err)
 		txh->txh_tx->tx_err = err;
 }
 
 /*
  * Charge the hold for rewriting the meta-dnode blocks that cover its dnode:
  * one meta-dnode data block plus one indirect block per additional level.
  * The amount goes to txh_space_tooverwrite when the dnode's dbuf has a block
  * pointer that is freeable, otherwise to txh_space_towrite; it is also added
  * to txh_space_tounref whenever that block pointer exists.
  */
 static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
 	dnode_t *dn = txh->txh_dnode;
 	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 	uint64_t space = mdn->dn_datablksz +
 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 	blkptr_t *bp = NULL;
 
 	if (dn != NULL)
 		bp = dn->dn_dbuf->db_blkptr;
 
 	if (bp != NULL &&
 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 	    bp, bp->blk_birth)) {
 		txh->txh_space_tooverwrite += space;
 	} else {
 		txh->txh_space_towrite += space;
 	}
 
 	if (bp != NULL)
 		txh->txh_space_tounref += space;
 }
 
 /*
  * Declare that the (not yet assigned) transaction will write 'len' bytes at
  * offset 'off' of 'object'.  Adds a THT_WRITE hold and, if that succeeds,
  * accounts for the space of the write and of the dnode update.
  */
 void
 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 {
 	dmu_tx_hold_t *txh;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(len < DMU_MAX_ACCESS);
 	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_WRITE, off, len);
 	if (txh != NULL) {
 		dmu_tx_count_write(txh, off, len);
 		dmu_tx_count_dnode(txh);
 	}
 }
 
 /*
  * Estimate the effect of freeing the byte range [off, off + len) of the
  * hold's dnode and add the estimates to the hold:
  *
  *	txh_space_tofree   - dsize of blocks the dataset may actually free
  *	txh_space_tounref  - asize of every block pointer in the range
  *	txh_memory_tohold  - level-1 indirect blocks read here, plus a
  *			     worst-case estimate for higher-level indirects
  *	txh_fudge	   - level-1 blocks skipped by dnode_next_offset()
  *
  * Errors from dnode_next_offset(), dbuf_hold_impl() or dbuf_read() are
  * recorded in txh->txh_tx->tx_err and end the scan early.
  */
 static void
 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	uint64_t blkid, nblks, lastblk;
 	uint64_t space = 0, unref = 0, skipped = 0;
 	dnode_t *dn = txh->txh_dnode;
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 	int epbs;
 	uint64_t l0span = 0, nl1blks = 0;
 
 	if (dn->dn_nlevels == 0)
 		return;
 
 	/*
 	 * The struct_rwlock protects us against dn_nlevels
 	 * changing, in case (against all odds) we manage to dirty &
 	 * sync out the changes after we check for being dirty.
 	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_maxblkid == 0) {
 		/* Single-block object: only a whole-block free counts. */
 		if (off == 0 && len >= dn->dn_datablksz) {
 			blkid = 0;
 			nblks = 1;
 		} else {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 	} else {
 		blkid = off >> dn->dn_datablkshift;
 		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 
 		if (blkid > dn->dn_maxblkid) {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 		/* Clamp the range to the last block of the object. */
 		if (blkid + nblks > dn->dn_maxblkid)
 			nblks = dn->dn_maxblkid - blkid + 1;
 
 	}
 	l0span = nblks;    /* save for later use to calc level > 1 overhead */
 	if (dn->dn_nlevels == 1) {
 		/* Block pointers live directly in the dnode. */
 		int i;
 		for (i = 0; i < nblks; i++) {
 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
 			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 			bp += blkid + i;
 			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
 				space += bp_get_dsize(spa, bp);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
 		nl1blks = 1;
 		nblks = 0;
 	}
 
 	lastblk = blkid + nblks - 1;
 	while (nblks) {
 		dmu_buf_impl_t *dbuf;
 		uint64_t ibyte, new_blkid;
 		int epb = 1 << epbs;
 		int err, i, blkoff, tochk;
 		blkptr_t *bp;
 
 		/* Find the next level-1 block at or after blkid. */
 		ibyte = blkid << dn->dn_datablkshift;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 		new_blkid = ibyte >> dn->dn_datablkshift;
 		if (err == ESRCH) {
 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 			break;
 		}
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
 		if (new_blkid > lastblk) {
 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 			break;
 		}
 
 		if (new_blkid > blkid) {
 			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 			nblks -= new_blkid - blkid;
 			blkid = new_blkid;
 		}
 		/* Portion of this level-1 block that lies in the range. */
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
 		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
 		    FALSE, FALSE, FTAG, &dbuf);
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
 
 		txh->txh_memory_tohold += dbuf->db.db_size;
 
 		/*
 		 * We don't check memory_tohold against DMU_MAX_ACCESS because
 		 * memory_tohold is an over-estimation (especially the >L1
 		 * indirect blocks), so it could fail.  Callers should have
 		 * already verified that they will not be holding too much
 		 * memory.
 		 */
 
 		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 		if (err != 0) {
 			txh->txh_tx->tx_err = err;
 			dbuf_rele(dbuf, FTAG);
 			break;
 		}
 
 		bp = dbuf->db.db_data;
 		bp += blkoff;
 
 		for (i = 0; i < tochk; i++) {
 			if (dsl_dataset_block_freeable(ds, &bp[i],
 			    bp[i].blk_birth)) {
 				dprintf_bp(&bp[i], "can free old%s", "");
 				space += bp_get_dsize(spa, &bp[i]);
 			}
 			/*
 			 * Charge the block pointer being examined, not
 			 * the first pointer of the window on every pass.
 			 */
 			unref += BP_GET_ASIZE(&bp[i]);
 		}
 		dbuf_rele(dbuf, FTAG);
 
 		++nl1blks;
 		blkid += tochk;
 		nblks -= tochk;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/*
 	 * Add in memory requirements of higher-level indirects.
 	 * This assumes a worst-possible scenario for dn_nlevels and a
 	 * worst-possible distribution of l1-blocks over the region to free.
 	 */
 	{
 		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 		int level = 2;
 		/*
 		 * Here we don't use DN_MAX_LEVEL, but calculate it with the
 		 * given datablkshift and indblkshift. This makes the
 		 * difference between 19 and 8 on large files.
 		 */
 		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 
 		while (level++ < maxlevel) {
 			txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 			    << dn->dn_indblkshift;
 			blkcnt = 1 + (blkcnt >> epbs);
 		}
 	}
 
 	/* account for new level 1 indirect blocks that might show up */
 	if (skipped > 0) {
 		txh->txh_fudge += skipped << dn->dn_indblkshift;
 		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 	}
 	txh->txh_space_tofree += space;
 	txh->txh_space_tounref += unref;
 }
 
 /*
  * This function marks the transaction as being a "net free".  The end
  * result is that refquotas will be disabled for this transaction, and
  * this transaction will be able to use half of the pool space overhead
  * (see dsl_pool_adjustedsize()).  Therefore this function should only
  * be called for transactions that we expect will not cause a net increase
  * in the amount of space used (but it's OK if that is occasionally not true).
  */
 void
 dmu_tx_mark_netfree(dmu_tx_t *tx)
 {
 	/*
 	 * Pretend that this operation will free 1GB of space.  This
 	 * should be large enough to cancel out the largest write.
 	 * We don't want to use something like UINT64_MAX, because that would
 	 * cause overflows when doing math with these values (e.g. in
 	 * dmu_tx_try_assign()).
 	 */
 	const uint64_t netfree_bytes = 1024 * 1024 * 1024;
 	dmu_tx_hold_t *hold;
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
 	    THT_FREE, 0, 0);
 
 	hold->txh_space_tounref = netfree_bytes;
 	hold->txh_space_tofree = netfree_bytes;
 }
 
 /*
  * Add a THT_FREE hold for the byte range [off, off + len) of `object'.
  * `len' may be DMU_OBJECT_END, meaning "through the end of the object".
  *
  * Partially covered first/last blocks are charged as writes, the level-1
  * indirect blocks of the range are checked for i/o errors, and the
  * freed space is estimated by dmu_tx_count_free().  Any error found is
  * recorded in tx->tx_err; the function itself returns nothing.
  */
 void
 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
 	int err;
 	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_FREE, off, len);
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
 	dmu_tx_count_dnode(txh);
 
 	/* Nothing to free at or beyond the end of the object. */
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
 
 	/*
 	 * For i/o error checking, we read the first and last level-0
 	 * blocks if they are not aligned, and all the level-1 blocks.
 	 *
 	 * Note:  dbuf_free_range() assumes that we have not instantiated
 	 * any level-0 dbufs that will be completely freed.  Therefore we must
 	 * exercise care to not read or count the first and last blocks
 	 * if they are blocksize-aligned.
 	 */
 	if (dn->dn_datablkshift == 0) {
 		if (off != 0 || len < dn->dn_datablksz)
 			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 	} else {
 		/* first block will be modified if it is not aligned */
 		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 			dmu_tx_count_write(txh, off, 1);
 		/* last block will be modified if it is not aligned */
 		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 			dmu_tx_count_write(txh, off+len, 1);
 	}
 
 	/*
 	 * Check level-1 blocks.
 	 */
 	if (dn->dn_nlevels > 1) {
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		uint64_t start = off >> shift;
 		uint64_t end = (off + len) >> shift;
 
 		ASSERT(dn->dn_indblkshift != 0);
 
 		/*
 		 * dnode_reallocate() can result in an object with indirect
 		 * blocks having an odd data block size.  In this case,
 		 * just check the single block.
 		 */
 		if (dn->dn_datablkshift == 0)
 			start = end = 0;
 
 		zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (uint64_t i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
 			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;
 				/*
 				 * Reap the root zio (and any children
 				 * already issued) before bailing out;
 				 * otherwise it is never waited on.
 				 */
 				(void) zio_wait(zio);
 				return;
 			}
 
 			err = dmu_tx_check_ioerr(zio, dn, 1, i);
 			if (err) {
 				tx->tx_err = err;
 				/* same as above: don't abandon the zio */
 				(void) zio_wait(zio);
 				return;
 			}
 		}
 		err = zio_wait(zio);
 		if (err) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
 	dmu_tx_count_free(txh, off, len);
 }
 
 /*
  * Add a THT_ZAP hold on `object'.  `add' is non-zero when an entry will
  * be added, and `name' (may be NULL) is the entry that will be touched.
  * I/O errors detected while probing the object are stored in tx->tx_err.
  */
 void
 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
 	dsl_dataset_phys_t *ds_phys;
 	uint64_t remaining;
 	int epbs, err;
 
 	ASSERT(tx->tx_txg == 0);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 	    THT_ZAP, add, (uintptr_t)name);
 	if (txh == NULL)
 		return;
 
 	dn = txh->txh_dnode;
 	dmu_tx_count_dnode(txh);
 
 	if (dn == NULL) {
 		/*
 		 * We will be able to fit a new object's entries into one leaf
 		 * block.  So there will be at most 2 blocks total,
 		 * including the header block.
 		 */
 		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 		return;
 	}
 
 	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
 	if (!add && dn->dn_maxblkid == 0) {
 		blkptr_t *bp;
 
 		/*
 		 * If there is only one block  (i.e. this is a micro-zap)
 		 * and we are not adding anything, the accounting is simple.
 		 */
 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 		if (err != 0) {
 			tx->tx_err = err;
 			return;
 		}
 
 		/*
 		 * Use max block size here, since we don't know how much
 		 * the size will change between now and the dbuf dirty call.
 		 */
 		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth)) {
 			txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 		} else {
 			txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 		}
 		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 		return;
 	}
 
 	if (name != NULL && dn->dn_maxblkid > 0) {
 		/*
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
 		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 		    8, 0, NULL);
 		if (err == EIO) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
 	/* The result of zap_count_write() is not acted upon here. */
 	(void) zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*
 	 * If the modified blocks are scattered to the four winds,
 	 * we'll have to modify an indirect twig for each.
 	 */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
 	for (remaining = dn->dn_maxblkid >> epbs; remaining != 0;
 	    remaining >>= epbs) {
 		if (ds_phys->ds_prev_snap_obj) {
 			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 		} else {
 			txh->txh_space_tooverwrite +=
 			    3 << dn->dn_indblkshift;
 		}
 	}
 }
 
 /*
  * Add a THT_BONUS hold on `object' and charge for its dnode.
  * The tx must not have been assigned to a txg yet.
  */
 void
 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 	    THT_BONUS, 0, 0);
 	if (hold == NULL)
 		return;
 
 	dmu_tx_count_dnode(hold);
 }
 
 /*
  * Add a THT_SPACE hold that is not tied to an existing object and
  * charge `space' bytes of write space to it.
  */
 void
 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
 	    THT_SPACE, space, 0);
 
 	hold->txh_space_towrite = hold->txh_space_towrite + space;
 }
 
 /*
  * Return the number of holds in this (assigned) transaction whose dnode
  * is `object'.
  */
 int
 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 {
 	dmu_tx_hold_t *txh;
 	int nholds = 0;
 
 	/*
 	 * By asserting that the tx is assigned, we're counting the
 	 * number of dn_tx_holds, which is the same as the number of
 	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
 	 * dn_tx_holds could be 0.
 	 */
 	ASSERT(tx->tx_txg != 0);
 
 	/* if (tx->tx_anyobj == TRUE) */
 		/* return (0); */
 
 	txh = list_head(&tx->tx_holds);
 	while (txh != NULL) {
 		dnode_t *dn = txh->txh_dnode;
 
 		if (dn != NULL && dn->dn_object == object)
 			nholds++;
 		txh = list_next(&tx->tx_holds, txh);
 	}
 
 	return (nholds);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only check that the dbuf being dirtied is covered by one of this
  * transaction's holds.  Returns quietly when the tx is an "anyobj" tx,
  * when the dbuf belongs to the meta dnode, or when some hold matches both
  * the object and the block; otherwise panics.
  */
 void
 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
 	dmu_tx_hold_t *txh;
 	int match_object = FALSE, match_offset = FALSE;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
 	if (tx->tx_anyobj) {
 		DB_DNODE_EXIT(db);
 		return;
 	}
 
 	/* XXX No checking on the meta dnode for now */
 	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		DB_DNODE_EXIT(db);
 		return;
 	}
 
 	/*
 	 * match_object and match_offset accumulate across holds: the
 	 * object may be matched by one hold and the offset by another.
 	 */
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 			match_object = TRUE;
 		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 			/*
 			 * Convert the hold's byte range (arg1 = offset,
 			 * arg2 = length) into block ids at the dbuf's level.
 			 */
 			int datablkshift = dn->dn_datablkshift ?
 			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 			int shift = datablkshift + epbs * db->db_level;
 			uint64_t beginblk = shift >= 64 ? 0 :
 			    (txh->txh_arg1 >> shift);
 			uint64_t endblk = shift >= 64 ? 0 :
 			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 			uint64_t blkid = db->db_blkid;
 
 			/* XXX txh_arg2 better not be zero... */
 
 			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 			    txh->txh_type, beginblk, endblk);
 
 			switch (txh->txh_type) {
 			case THT_WRITE:
 				if (blkid >= beginblk && blkid <= endblk)
 					match_offset = TRUE;
 				/*
 				 * We will let this hold work for the bonus
 				 * or spill buffer so that we don't need to
 				 * hold it when creating a new object.
 				 */
 				if (blkid == DMU_BONUS_BLKID ||
 				    blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				/*
 				 * They might have to increase nlevels,
 				 * thus dirtying the new TLIBs.  Or they
 				 * might have to change the block size,
 				 * thus dirtying the new lvl=0 blk=0.
 				 */
 				if (blkid == 0)
 					match_offset = TRUE;
 				break;
 			case THT_FREE:
 				/*
 				 * We will dirty all the level 1 blocks in
 				 * the free range and perhaps the first and
 				 * last level 0 block.
 				 */
 				if (blkid >= beginblk && (blkid <= endblk ||
 				    txh->txh_arg2 == DMU_OBJECT_END))
 					match_offset = TRUE;
 				break;
 			case THT_SPILL:
 				if (blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_BONUS:
 				if (blkid == DMU_BONUS_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_ZAP:
 				/* a ZAP hold covers every block of the object */
 				match_offset = TRUE;
 				break;
 			case THT_NEWOBJECT:
 				match_object = TRUE;
 				break;
 			default:
 				ASSERT(!"bad txh_type");
 			}
 		}
 		if (match_object && match_offset) {
 			DB_DNODE_EXIT(db);
 			return;
 		}
 	}
 	DB_DNODE_EXIT(db);
 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 	    (u_longlong_t)db->db.db_object, db->db_level,
 	    (u_longlong_t)db->db_blkid);
 }
 #endif
 
 /*
  * If we can't do 10 iops, something is wrong.  Let us go ahead
  * and hit zfs_dirty_data_max.
  */
 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 
 /*
  * We delay transactions when we've determined that the backend storage
  * isn't able to accommodate the rate of incoming writes.
  *
  * If there is already a transaction waiting, we delay relative to when
  * that transaction finishes waiting.  This way the calculated min_time
  * is independent of the number of threads concurrently executing
  * transactions.
  *
  * If we are the only waiter, wait relative to when the transaction
  * started, rather than the current time.  This credits the transaction for
  * "time already served", e.g. reading indirect blocks.
  *
  * The minimum time for a transaction to take is calculated as:
  *     min_time = scale * (dirty - min) / (max - dirty)
  *     min_time is then capped at zfs_delay_max_ns.
  *
  * The delay has two degrees of freedom that can be adjusted via tunables.
  * The percentage of dirty data at which we start to delay is defined by
  * zfs_delay_min_dirty_percent. This should typically be at or above
  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
  * delay after writing at full speed has failed to keep up with the incoming
  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
  * speaking, this variable determines the amount of delay at the midpoint of
  * the curve.
  *
  * delay
  *  10ms +-------------------------------------------------------------*+
  *       |                                                             *|
  *   9ms +                                                             *+
  *       |                                                             *|
  *   8ms +                                                             *+
  *       |                                                            * |
  *   7ms +                                                            * +
  *       |                                                            * |
  *   6ms +                                                            * +
  *       |                                                            * |
  *   5ms +                                                           *  +
  *       |                                                           *  |
  *   4ms +                                                           *  +
  *       |                                                           *  |
  *   3ms +                                                          *   +
  *       |                                                          *   |
  *   2ms +                                              (midpoint) *    +
  *       |                                                  |    **     |
  *   1ms +                                                  v ***       +
  *       |             zfs_delay_scale ---------->     ********         |
  *     0 +-------------------------------------*********----------------+
  *       0%                    <- zfs_dirty_data_max ->               100%
  *
  * Note that since the delay is added to the outstanding time remaining on the
  * most recent transaction, the delay is effectively the inverse of IOPS.
  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
  * was chosen such that small changes in the amount of accumulated dirty data
  * in the first 3/4 of the curve yield relatively small differences in the
  * amount of delay.
  *
  * The effects can be easier to understand when the amount of delay is
  * represented on a log scale:
  *
  * delay
  * 100ms +-------------------------------------------------------------++
  *       +                                                              +
  *       |                                                              |
  *       +                                                             *+
  *  10ms +                                                             *+
  *       +                                                           ** +
  *       |                                              (midpoint)  **  |
  *       +                                                  |     **    +
  *   1ms +                                                  v ****      +
  *       +             zfs_delay_scale ---------->        *****         +
  *       |                                             ****             |
  *       +                                          ****                +
  * 100us +                                        **                    +
  *       +                                       *                      +
  *       |                                      *                       |
  *       +                                     *                        +
  *  10us +                                     *                        +
  *       +                                                              +
  *       |                                                              |
  *       +                                                              +
  *       +--------------------------------------------------------------+
  *       0%                    <- zfs_dirty_data_max ->               100%
  *
  * Note here that only as the amount of dirty data approaches its limit does
  * the delay start to increase rapidly. The goal of a properly tuned system
  * should be to keep the amount of dirty data out of that range by first
  * ensuring that the appropriate limits are set for the I/O scheduler to reach
  * optimal throughput on the backend storage, and then by changing the value
  * of zfs_delay_scale to increase the steepness of the curve.
  */
 static void
 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	/* amount of dirty data below which no delay is applied */
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
 		return;
 
 	/*
 	 * The caller has already waited until we are under the max.
 	 * We make them pass us the amount of dirty data so we don't
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
 	ASSERT3U(dirty, <, zfs_dirty_data_max);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
 	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
 	/* The tx has already been running longer than the minimum time. */
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
 	/* Note: the cap is applied only after the early-return test above. */
 	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
 
 	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
 	    uint64_t, min_tx_time);
 
 	/*
 	 * Wake up min_tx_time after whichever is later: this tx's start
 	 * or the previous waiter's wakeup.  dp_last_wakeup is updated
 	 * under dp_lock so that waiters queue up behind one another.
 	 */
 	mutex_enter(&dp->dp_lock);
 	wakeup = MAX(tx->tx_start + min_tx_time,
 	    dp->dp_last_wakeup + min_tx_time);
 	dp->dp_last_wakeup = wakeup;
 	mutex_exit(&dp->dp_lock);
 
 #ifdef _KERNEL
 #ifdef illumos
 	/* Sleep until the absolute time `wakeup'. */
 	mutex_enter(&curthread->t_delay_lock);
 	while (cv_timedwait_hires(&curthread->t_delay_cv,
 	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
 	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
 		continue;
 	mutex_exit(&curthread->t_delay_lock);
 #else
 	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
 	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
 #endif
 #else
 	/*
 	 * Userland: sleep for the time remaining until `wakeup'.
 	 * NOTE(review): delta is negative if `wakeup' has already passed;
 	 * the nanosleep() result is ignored -- confirm that is intended.
 	 */
 	hrtime_t delta = wakeup - gethrtime();
 	struct timespec ts;
 	ts.tv_sec = delta / NANOSEC;
 	ts.tv_nsec = delta % NANOSEC;
 	(void) nanosleep(&ts, NULL);
 #endif
 }
 
 /*
  * Make one attempt to assign tx to the currently open txg.
  *
  * Returns 0 on success.  Otherwise returns:
  *	- tx->tx_err, if an earlier hold recorded an error;
  *	- EIO or ERESTART, if the pool is suspended;
  *	- ERESTART, if the tx must first wait for dirty data to drain or
  *	  if a held dnode is still assigned to the previous txg (in which
  *	  case tx_needassign_txh identifies the hold);
  *	- the error from dsl_dir_tempreserve_space().
  *
  * Once txg_hold_open() has been called, a failure leaves tx->tx_txg set
  * and some dnode holds taken; dmu_tx_unassign() undoes that state.
  */
 static int
 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
 	uint64_t memory, asize, fsize, usize;
 	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 
 	ASSERT0(tx->tx_txg);
 
 	if (tx->tx_err)
 		return (tx->tx_err);
 
 	if (spa_suspended(spa)) {
 		/*
 		 * If the user has indicated a blocking failure mode
 		 * then return ERESTART which will block in dmu_tx_wait().
 		 * Otherwise, return EIO so that an error can get
 		 * propagated back to the VOP calls.
 		 *
 		 * Note that we always honor the txg_how flag regardless
 		 * of the failuremode setting.
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 		    txg_how != TXG_WAIT)
 			return (SET_ERROR(EIO));
 
 		return (SET_ERROR(ERESTART));
 	}
 
 	/* Ask the caller to go through dmu_tx_wait() for the dirty delay. */
 	if (!tx->tx_waited &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
 		return (SET_ERROR(ERESTART));
 	}
 
 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 	tx->tx_needassign_txh = NULL;
 
 	/*
 	 * NB: No error returns are allowed after txg_hold_open, but
 	 * before processing the dnode holds, due to the
 	 * dmu_tx_unassign() logic.
 	 */
 
 	/* Assign each held dnode to this txg and total up the estimates. */
 	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
 		if (dn != NULL) {
 			mutex_enter(&dn->dn_mtx);
 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 				/*
 				 * Still assigned to the previous txg;
 				 * remember the hold so dmu_tx_wait() and
 				 * dmu_tx_unassign() know where we stopped.
 				 */
 				mutex_exit(&dn->dn_mtx);
 				tx->tx_needassign_txh = txh;
 				return (SET_ERROR(ERESTART));
 			}
 			if (dn->dn_assigned_txg == 0)
 				dn->dn_assigned_txg = tx->tx_txg;
 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
 		towrite += txh->txh_space_towrite;
 		tofree += txh->txh_space_tofree;
 		tooverwrite += txh->txh_space_tooverwrite;
 		tounref += txh->txh_space_tounref;
 		tohold += txh->txh_memory_tohold;
 		fudge += txh->txh_fudge;
 	}
 
 	/*
 	 * If a snapshot has been taken since we made our estimates,
 	 * assume that we won't be able to free or overwrite anything.
 	 */
 	if (tx->tx_objset &&
 	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 	    tx->tx_lastsnap_txg) {
 		towrite += tooverwrite;
 		tooverwrite = tofree = 0;
 	}
 
 	/* needed allocation: worst-case estimate of write space */
 	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 	/* freed space estimate: worst-case overwrite + free estimate */
 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 	/* convert unrefd space to worst-case estimate */
 	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 	/* calculate memory footprint estimate */
 	memory = towrite + tooverwrite + tohold;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Add in 'tohold' to account for our dirty holds on this memory
 	 * XXX - the "fudge" factor is to account for skipped blocks that
 	 * we missed because dnode_next_offset() misses in-core-only blocks.
 	 */
 	tx->tx_space_towrite = asize +
 	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 	tx->tx_space_tofree = tofree;
 	tx->tx_space_tooverwrite = tooverwrite;
 	tx->tx_space_tounref = tounref;
 #endif
 
 	/* Only reserve space when there is a dsl_dir and something to write. */
 	if (tx->tx_dir && asize != 0) {
 		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
 		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
 		if (err)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Undo a failed dmu_tx_try_assign(): drop the dnode tx holds that were
  * taken, release the txg hold, and remember the txg that was tried.
  * Does nothing if the tx never got a txg.
  */
 static void
 dmu_tx_unassign(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *hold;
 
 	if (tx->tx_txg == 0)
 		return;
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
 	/*
 	 * Walk the transaction's hold list, removing the hold on the
 	 * associated dnode, and notifying waiters if the refcount drops to 0.
 	 * The walk stops at tx_needassign_txh (NULL means the list end).
 	 */
 	for (hold = list_head(&tx->tx_holds);
 	    hold != tx->tx_needassign_txh;
 	    hold = list_next(&tx->tx_holds, hold)) {
 		dnode_t *dn = hold->txh_dnode;
 
 		if (dn != NULL) {
 			mutex_enter(&dn->dn_mtx);
 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
 			if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 				dn->dn_assigned_txg = 0;
 				cv_broadcast(&dn->dn_notxholds);
 			}
 			mutex_exit(&dn->dn_mtx);
 		}
 	}
 
 	txg_rele_to_sync(&tx->tx_txgh);
 
 	tx->tx_lasttried_txg = tx->tx_txg;
 	tx->tx_txg = 0;
 }
 
 /*
  * Assign tx to a transaction group.  txg_how can be one of:
  *
  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
  *	a new one.  This should be used when you're not holding locks.
  *	It will only fail if we're truly out of space (or over quota).
  *
  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
  *	blocking, returns immediately with ERESTART.  This should be used
  *	whenever you're holding locks.  On an ERESTART error, the caller
  *	should drop locks, do a dmu_tx_wait(tx), and try again.
  *
  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
  *      has already been called on behalf of this operation (though
  *      most likely on a different tx).
  */
 int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
 	    txg_how == TXG_WAITED);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
 	/* If we might wait, we must not hold the config lock. */
 	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
 
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
 	for (;;) {
 		err = dmu_tx_try_assign(tx, txg_how);
 		if (err == 0)
 			break;
 
 		/* Roll back whatever the failed attempt set up. */
 		dmu_tx_unassign(tx);
 
 		/* Only TXG_WAIT callers retry, and only on ERESTART. */
 		if (txg_how != TXG_WAIT || err != ERESTART)
 			return (err);
 
 		dmu_tx_wait(tx);
 	}
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
 	return (0);
 }
 
 /*
  * Block until it is worth retrying the assignment of this tx.  What is
  * waited for depends on why the previous attempt failed.
  */
 void
 dmu_tx_wait(dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(!dsl_pool_config_held(dp));
 
 	if (tx->tx_wait_dirty) {
 		uint64_t dirty;
 
 		/*
 		 * dmu_tx_try_assign() has determined that we need to wait
 		 * because we've consumed much or all of the dirty buffer
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
 		while (dp->dp_dirty_total >= zfs_dirty_data_max)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
 
 		dmu_tx_delay(tx, dirty);
 
 		tx->tx_wait_dirty = B_FALSE;
 
 		/*
 		 * Note: setting tx_waited only has effect if the caller
 		 * used TXG_WAIT.  Otherwise they are going to destroy
 		 * this tx and try again.  The common case, zfs_write(),
 		 * uses TXG_WAIT.
 		 */
 		tx->tx_waited = B_TRUE;
 		return;
 	}
 
 	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
 		/*
 		 * If the pool is suspended we need to wait until it
 		 * is resumed.  Note that it's possible that the pool
 		 * has become active after this thread has tried to
 		 * obtain a tx.  If that's the case then tx_lasttried_txg
 		 * would not have been set.
 		 */
 		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 		return;
 	}
 
 	if (tx->tx_needassign_txh) {
 		/*
 		 * A dnode is assigned to the quiescing txg.  Wait for its
 		 * transaction to complete.
 		 */
 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 
 		mutex_enter(&dn->dn_mtx);
 		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
 			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
 		mutex_exit(&dn->dn_mtx);
 		tx->tx_needassign_txh = NULL;
 		return;
 	}
 
 	/* Otherwise wait for the txg after the one we last tried to open. */
 	txg_wait_open(dp, tx->tx_lasttried_txg + 1);
 }
 
 /*
  * Debug-only accounting of space consumed (delta > 0) or freed
  * (delta < 0) under this tx.  Compiles to a no-op without ZFS_DEBUG.
  */
 void
 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
 {
 #ifdef ZFS_DEBUG
 	if (delta == 0 || tx->tx_dir == NULL)
 		return;
 
 	if (delta < 0) {
 		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
 		return;
 	}
 
 	/* Writes must stay within what the tx estimated it would write. */
 	ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
 	    tx->tx_space_towrite);
 	(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
 #endif
 }
 
 /*
  * Commit an assigned transaction: release every hold, clear the
  * temporary space reservation, hand the commit callbacks to the txg,
  * and free the tx.
  */
 void
 dmu_tx_commit(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg != 0);
 
 	/*
 	 * Go through the transaction's hold list and remove holds on
 	 * associated dnodes, notifying waiters if no holds remain.
 	 */
 	while ((hold = list_head(&tx->tx_holds)) != NULL) {
 		dnode_t *dn = hold->txh_dnode;
 
 		list_remove(&tx->tx_holds, hold);
 		kmem_free(hold, sizeof (dmu_tx_hold_t));
 
 		if (dn != NULL) {
 			mutex_enter(&dn->dn_mtx);
 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
 			if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 				dn->dn_assigned_txg = 0;
 				cv_broadcast(&dn->dn_notxholds);
 			}
 			mutex_exit(&dn->dn_mtx);
 			dnode_rele(dn, tx);
 		}
 	}
 
 	if (tx->tx_tempreserve_cookie)
 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
 
 	if (!list_is_empty(&tx->tx_callbacks))
 		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
 
 	if (!tx->tx_anyobj)
 		txg_rele_to_sync(&tx->tx_txgh);
 
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
 	refcount_destroy_many(&tx->tx_space_written,
 	    refcount_count(&tx->tx_space_written));
 	refcount_destroy_many(&tx->tx_space_freed,
 	    refcount_count(&tx->tx_space_freed));
 #endif
 	kmem_free(tx, sizeof (dmu_tx_t));
 }
 
 /*
  * Abandon an unassigned transaction: release every hold, run the
  * registered callbacks with ECANCELED, and free the tx.
  */
 void
 dmu_tx_abort(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *hold;
 
 	ASSERT(tx->tx_txg == 0);
 
 	while ((hold = list_head(&tx->tx_holds)) != NULL) {
 		dnode_t *dn = hold->txh_dnode;
 
 		list_remove(&tx->tx_holds, hold);
 		kmem_free(hold, sizeof (dmu_tx_hold_t));
 		if (dn == NULL)
 			continue;
 		dnode_rele(dn, tx);
 	}
 
 	/*
 	 * Call any registered callbacks with an error code.
 	 */
 	if (!list_is_empty(&tx->tx_callbacks))
 		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
 
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	refcount_destroy_many(&tx->tx_space_written,
 	    refcount_count(&tx->tx_space_written));
 	refcount_destroy_many(&tx->tx_space_freed,
 	    refcount_count(&tx->tx_space_freed));
 #endif
 	kmem_free(tx, sizeof (dmu_tx_t));
 }
 
 /*
  * Return the txg this transaction is assigned to; the tx must already
  * be assigned.
  */
 uint64_t
 dmu_tx_get_txg(dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(txg != 0);
 	return (txg);
 }
 
 /*
  * Return the dsl_pool this transaction belongs to.
  */
 dsl_pool_t *
 dmu_tx_pool(dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 
 	ASSERT(dp != NULL);
 	return (dp);
 }
 
 
 /*
  * Queue `func' (with argument `data') on the tx's callback list.  The
  * callbacks are later run by dmu_tx_do_callbacks().
  */
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
 	dmu_tx_callback_t *cb = kmem_alloc(sizeof (*cb), KM_SLEEP);
 
 	cb->dcb_data = data;
 	cb->dcb_func = func;
 
 	list_insert_tail(&tx->tx_callbacks, cb);
 }
 
 /*
  * Drain a callback list: each entry is unlinked, invoked with its
  * registered data and the given error code, and then freed.
  */
 void
 dmu_tx_do_callbacks(list_t *cb_list, int error)
 {
 	dmu_tx_callback_t *cb;
 
 	for (cb = list_head(cb_list); cb != NULL; cb = list_head(cb_list)) {
 		list_remove(cb_list, cb);
 		cb->dcb_func(cb->dcb_data, error);
 		kmem_free(cb, sizeof (dmu_tx_callback_t));
 	}
 }
 
 /*
  * Interface to hold a bunch of attributes.
  * used for creating new files.
  * attrsize is the total size of all attributes
  * to be added during object creation
  *
  * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
  */
 
 /*
  * Take a ZAP hold for the name of every attribute that is not yet
  * registered.  This should be a very rare case; if it happens at all it
  * would only happen on the first write to the file system.
  */
 static void
 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
 {
 	int idx;
 
 	if (!sa->sa_need_attr_registration)
 		return;
 
 	for (idx = 0; idx != sa->sa_num_attrs; idx++) {
 		uint64_t zapobj;
 
 		if (sa->sa_attr_table[idx].sa_registered)
 			continue;
 
 		/* No registry object yet: hold a to-be-created object. */
 		zapobj = sa->sa_reg_attr_obj ?
 		    sa->sa_reg_attr_obj : DMU_NEW_OBJECT;
 		dmu_tx_hold_zap(tx, zapobj, B_TRUE,
 		    sa->sa_attr_table[idx].sa_name);
 	}
 }
 
 
 /*
  * Add a THT_SPILL hold on `object' and charge for rewriting its spill
  * block.  If the hold cannot be created, or the hold has no dnode,
  * nothing is charged.
  */
 void
 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_tx_hold_t *txh;
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 	    THT_SPILL, 0, 0);
 	/*
 	 * The other dmu_tx_hold_*() callers treat a NULL hold as "could
 	 * not hold the object"; do the same here instead of dereferencing.
 	 */
 	if (txh == NULL)
 		return;
 
 	dn = txh->txh_dnode;
 
 	if (dn == NULL)
 		return;
 
 	/* If blkptr doesn't exist then add space to towrite */
 	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 	} else {
 		blkptr_t *bp;
 
 		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
 			txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
 		else
 			txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
 	}
 }
 
 /*
  * Take the holds needed to create a new object with system attributes.
  * `attrsize' is the total size of the attributes; a spill hold is added
  * when it exceeds DN_MAX_BONUSLEN or when sa_force_spill is set.
  */
 void
 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
 {
 	sa_os_t *sa = tx->tx_objset->os_sa;
 
 	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 
 	if (sa->sa_master_obj == 0)
 		return;
 
 	if (sa->sa_layout_attr_obj == 0) {
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	} else {
 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 	}
 
 	dmu_tx_sa_registration_hold(sa, tx);
 
 	if (sa->sa_force_spill || attrsize > DN_MAX_BONUSLEN) {
 		(void) dmu_tx_hold_object_impl(tx, tx->tx_objset,
 		    DMU_NEW_OBJECT, THT_SPILL, 0, 0);
 	}
 }
 
 /*
  * Hold SA attribute
  *
  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
  *
  * Takes the holds needed to update the system attributes of the object
  * behind `hdl'.  When `may_grow' is set, the layout ZAP (if it exists)
  * and the spill block are held as well.
  */
 void
 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 {
 	sa_os_t *sa = tx->tx_objset->os_sa;
 	uint64_t object;
 
 	ASSERT(hdl != NULL);
 
 	object = sa_handle_object(hdl);
 
 	dmu_tx_hold_bonus(tx, object);
 
 	if (sa->sa_master_obj == 0)
 		return;
 
 	if (sa->sa_reg_attr_obj == 0 || sa->sa_layout_attr_obj == 0) {
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	}
 
 	dmu_tx_sa_registration_hold(sa, tx);
 
 	if (may_grow && sa->sa_layout_attr_obj != 0)
 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 
 	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
 		ASSERT(tx->tx_txg == 0);
 		dmu_tx_hold_spill(tx, object);
 	} else {
 		/* Otherwise hold the spill only if the dnode has one. */
 		dmu_buf_impl_t *bonus_db = (dmu_buf_impl_t *)hdl->sa_bonus;
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(bonus_db);
 		dn = DB_DNODE(bonus_db);
 		if (dn->dn_have_spill) {
 			ASSERT(tx->tx_txg == 0);
 			dmu_tx_hold_spill(tx, object);
 		}
 		DB_DNODE_EXIT(bonus_db);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	(revision 296519)
@@ -1,2043 +1,2044 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/range_tree.h>
 
 /* kmem cache for dnode_t; created in dnode_init(), torn down in dnode_fini(). */
 static kmem_cache_t *dnode_cache;
 /*
  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  * turned on when DEBUG is also defined.
  */
 #ifdef	DEBUG
 #define	DNODE_STATS
 #endif	/* DEBUG */
 
 /* Bump a statistics counter; expands to nothing without DNODE_STATS. */
 #ifdef	DNODE_STATS
 #define	DNODE_STAT_ADD(stat)			((stat)++)
 #else
 #define	DNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* DNODE_STATS */
 
 /*
  * Zero-initialized dnode_phys_t; dnode_allocate() compares a dnode's on-disk
  * image against it to assert that the slot is still empty.
  */
 static dnode_phys_t dnode_phys_zero;
 
 /*
  * Defaults used by dnode_allocate() when the caller passes 0:
  * zfs_default_bs is the data block shift, zfs_default_ibs the indirect
  * block shift.
  */
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
 /* The kmem move callback is only declared when building for illumos. */
 #ifdef illumos
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif
 
 /*
  * AVL comparator for a dnode's dn_dbufs tree.
  *
  * Orders dbufs by level, then by block id.  Among entries with the same
  * level and block id, one whose state is DB_SEARCH sorts first; at most one
  * of the two may be in that state.  Remaining ties are broken by comparing
  * the addresses of the two dbufs, so only the same dbuf compares equal.
  *
  * Returns -1, 0 or 1.
  */
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
 	const dmu_buf_impl_t *lhs = x1;
 	const dmu_buf_impl_t *rhs = x2;
 
 	if (lhs->db_level != rhs->db_level)
 		return (lhs->db_level < rhs->db_level ? -1 : 1);
 
 	if (lhs->db_blkid != rhs->db_blkid)
 		return (lhs->db_blkid < rhs->db_blkid ? -1 : 1);
 
 	if (lhs->db_state == DB_SEARCH) {
 		ASSERT3S(rhs->db_state, !=, DB_SEARCH);
 		return (-1);
 	}
 	if (rhs->db_state == DB_SEARCH) {
 		ASSERT3S(lhs->db_state, !=, DB_SEARCH);
 		return (1);
 	}
 
 	if ((uintptr_t)lhs == (uintptr_t)rhs)
 		return (0);
 	return ((uintptr_t)lhs < (uintptr_t)rhs ? -1 : 1);
 }
 
 /*
  * kmem cache constructor for dnode_t.
  *
  * Initializes the locks, condition variable, refcounts, list links,
  * per-txg dirty-record lists and the dbuf AVL tree, and puts every other
  * field visited here into its zero/NULL state.  dn_objset is left
  * invalidated.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
 	dnode_t *dn = arg;
 	int txgoff;
 
 	/* Synchronization primitives. */
 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
 	/*
 	 * dn_holds is deliberately untracked: each dbuf takes a reference,
 	 * and releasing a tracked reference costs O(number of references).
 	 */
 	refcount_create_untracked(&dn->dn_holds);
 	refcount_create(&dn->dn_tx_holds);
 	list_link_init(&dn->dn_link);
 
 	/* Pending per-txg changes start out empty. */
 	bzero(dn->dn_next_nblkptr, sizeof (dn->dn_next_nblkptr));
 	bzero(dn->dn_next_nlevels, sizeof (dn->dn_next_nlevels));
 	bzero(dn->dn_next_indblkshift, sizeof (dn->dn_next_indblkshift));
 	bzero(dn->dn_next_bonustype, sizeof (dn->dn_next_bonustype));
 	bzero(dn->dn_rm_spillblk, sizeof (dn->dn_rm_spillblk));
 	bzero(dn->dn_next_bonuslen, sizeof (dn->dn_next_bonuslen));
 	bzero(dn->dn_next_blksz, sizeof (dn->dn_next_blksz));
 
 	for (txgoff = 0; txgoff < TXG_SIZE; txgoff++) {
 		list_link_init(&dn->dn_dirty_link[txgoff]);
 		dn->dn_free_ranges[txgoff] = NULL;
 		list_create(&dn->dn_dirty_records[txgoff],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
 	/* Transaction-group bookkeeping. */
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 	dn->dn_dirtyctx = 0;
 	dn->dn_dirtyctx_firstset = NULL;
 
 	/* Bonus/spill state and the old/new accounting fields. */
 	dn->dn_bonus = NULL;
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_zio = NULL;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_id_flags = 0;
 
 	/* No dbufs yet. */
 	dn->dn_dbufs_count = 0;
 	dn->dn_unlisted_l0_blkid = 0;
 	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	dn->dn_moved = 0;
 	POINTER_INVALIDATE(&dn->dn_objset);
 	return (0);
 }
 
 /*
  * kmem cache destructor for dnode_t.
  *
  * Destroys the locks, condition variable, refcounts, per-txg dirty-record
  * lists and dbuf AVL tree that dnode_cons() created.  Every other field is
  * only asserted to be back at its constructed (zero/NULL) value; nothing is
  * reset here, so whoever returns a dnode to the cache must have cleared it.
  */
 /* ARGSUSED */
 static void
 dnode_dest(void *arg, void *unused)
 {
 	int i;
 	dnode_t *dn = arg;
 
 	rw_destroy(&dn->dn_struct_rwlock);
 	mutex_destroy(&dn->dn_mtx);
 	mutex_destroy(&dn->dn_dbufs_mtx);
 	cv_destroy(&dn->dn_notxholds);
 	refcount_destroy(&dn->dn_holds);
 	refcount_destroy(&dn->dn_tx_holds);
 	ASSERT(!list_link_active(&dn->dn_link));
 
 	/* Per-txg state: lists are destroyed, pending changes must be clear. */
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 		list_destroy(&dn->dn_dirty_records[i]);
 		ASSERT0(dn->dn_next_nblkptr[i]);
 		ASSERT0(dn->dn_next_nlevels[i]);
 		ASSERT0(dn->dn_next_indblkshift[i]);
 		ASSERT0(dn->dn_next_bonustype[i]);
 		ASSERT0(dn->dn_rm_spillblk[i]);
 		ASSERT0(dn->dn_next_bonuslen[i]);
 		ASSERT0(dn->dn_next_blksz[i]);
 	}
 
 	/* Scalar fields must match what dnode_cons() set. */
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_free_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT0(dn->dn_dirtyctx);
 	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 	ASSERT3P(dn->dn_bonus, ==, NULL);
 	ASSERT(!dn->dn_have_spill);
 	ASSERT3P(dn->dn_zio, ==, NULL);
 	ASSERT0(dn->dn_oldused);
 	ASSERT0(dn->dn_oldflags);
 	ASSERT0(dn->dn_olduid);
 	ASSERT0(dn->dn_oldgid);
 	ASSERT0(dn->dn_newuid);
 	ASSERT0(dn->dn_newgid);
 	ASSERT0(dn->dn_id_flags);
 
 	ASSERT0(dn->dn_dbufs_count);
 	ASSERT0(dn->dn_unlisted_l0_blkid);
 	avl_destroy(&dn->dn_dbufs);
 }
 
 /*
  * Create the dnode_t kmem cache, with dnode_cons()/dnode_dest() as its
  * constructor and destructor, and register dnode_move() as its move
  * callback.  Must not be called while the cache already exists.
  *
  * NOTE(review): dnode_move is only declared under "#ifdef illumos";
  * presumably kmem_cache_set_move() discards its arguments elsewhere -
  * confirm.
  */
 void
 dnode_init(void)
 {
 	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t",
 	    sizeof (dnode_t),
 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 	kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
 /*
  * Destroy the dnode_t kmem cache and clear the pointer, so that a later
  * dnode_init() passes its "cache not yet created" assertion.
  */
 void
 dnode_fini(void)
 {
 	kmem_cache_destroy(dnode_cache);
 	dnode_cache = NULL;
 }
 
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency check of an in-core dnode.
  *
  * The phys/objset/handle and type assertions always run.  The detailed
  * field checks run only when ZFS_DEBUG_DNODE_VERIFY is set in zfs_flags;
  * for those, dn_struct_rwlock is taken as reader unless it is already
  * write-held, and dropped again before returning.
  */
 void
 dnode_verify(dnode_t *dn)
 {
 	int drop_struct_lock = FALSE;
 
 	ASSERT(dn->dn_phys);
 	ASSERT(dn->dn_objset);
 	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 		return;
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 	/* Geometry checks apply only to dnodes that are (being) allocated. */
 	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 		int i;
 		ASSERT3U(dn->dn_indblkshift, >=, 0);
 		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 		if (dn->dn_datablkshift) {
 			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 		}
 		ASSERT3U(dn->dn_nlevels, <=, 30);
 		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 		ASSERT3U(dn->dn_nblkptr, >=, 1);
 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		ASSERT3U(dn->dn_datablksz, ==,
 		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		/* dn_datablkshift is nonzero exactly for power-of-2 sizes. */
 		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 		/* Extra block pointers and the bonus share the same space. */
 		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 		    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		for (i = 0; i < TXG_SIZE; i++) {
 			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 		}
 	}
 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 	/* dn_phys must point at this object's slot inside its dnode block. */
 	if (dn->dn_dbuf != NULL) {
 		ASSERT3P(dn->dn_phys, ==,
 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 	}
 	if (drop_struct_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 }
 #endif
 
 /*
  * Byteswap one on-disk dnode in place.
  *
  * A dnode of type DMU_OT_NONE is zeroed instead of swapped.  Otherwise the
  * multi-byte header fields, the block pointer array, the bonus area (using
  * the byteswap routine selected by the bonus type) and, when the spill flag
  * is set, the spill block pointer are swapped.
  */
 void
 dnode_byteswap(dnode_phys_t *dnp)
 {
 	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 	int i;
 
 	if (dnp->dn_type == DMU_OT_NONE) {
 		bzero(dnp, sizeof (dnode_phys_t));
 		return;
 	}
 
 	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 	dnp->dn_used = BSWAP_64(dnp->dn_used);
 
 	/*
 	 * dn_nblkptr is only one byte, so it's OK to read it in either
 	 * byte order.  We can't read dn_bonuslen.
 	 */
 	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 	/* Swap the block pointer array as a run of 64-bit words. */
 	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 		buf64[i] = BSWAP_64(buf64[i]);
 
 	/*
 	 * OK to check dn_bonuslen for zero, because it won't matter if
 	 * we have the wrong byte order.  This is necessary because the
 	 * dnode dnode is smaller than a regular dnode.
 	 */
 	if (dnp->dn_bonuslen != 0) {
 		/*
 		 * Note that the bonus length calculated here may be
 		 * longer than the actual bonus buffer.  This is because
 		 * we always put the bonus buffer after the last block
 		 * pointer (instead of packing it against the end of the
 		 * dnode buffer).
 		 */
 		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 		size_t len = DN_MAX_BONUSLEN - off;
 		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(dnp->dn_bonustype);
 		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 	}
 
 	/* Swap SPILL block if we have one */
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
 
 }
 
 /*
  * Byteswap every dnode in a buffer of on-disk dnodes.
  *
  *	vbuf	start of the buffer, treated as an array of dnode_phys_t
  *	size	buffer length in bytes; must be a multiple of
  *		sizeof (dnode_phys_t)
  *
  * The element index is a size_t so that it is compared against the
  * size_t element count without a signed/unsigned conversion.
  */
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
 	dnode_phys_t *buf = vbuf;
 	size_t ndnodes;
 	size_t i;
 
 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
 	ndnodes = size >> DNODE_SHIFT;
 	for (i = 0; i < ndnodes; i++)
 		dnode_byteswap(&buf[i]);
 }
 
 /*
  * Change the bonus length of a held dnode in transaction tx.
  *
  * The dnode is dirtied, then the new length is stored and recorded for
  * tx's txg under dn_struct_rwlock held as writer.  A new length of zero is
  * recorded as DN_ZERO_BONUSLEN rather than 0.
  */
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 
 	dnode_setdirty(dn, tx);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	/* The bonus shares its space with any extra block pointers. */
 	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 	dn->dn_bonuslen = newsize;
 	if (newsize != 0)
 		dn->dn_next_bonuslen[txgoff] = dn->dn_bonuslen;
 	else
 		dn->dn_next_bonuslen[txgoff] = DN_ZERO_BONUSLEN;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Change the bonus type of a held dnode in transaction tx.
  *
  * The dnode is dirtied, then the new type is stored and recorded for tx's
  * txg under dn_struct_rwlock held as writer.
  */
 void
 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 
 	dnode_setdirty(dn, tx);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dn->dn_bonustype = newtype;
 	dn->dn_next_bonustype[txgoff] = dn->dn_bonustype;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Schedule removal of the dnode's spill block in tx's txg and mark the
  * in-core dnode as no longer having one.  The dnode must be held and
  * dn_struct_rwlock must be held as writer.
  */
 void
 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	dnode_setdirty(dn, tx);
 	dn->dn_rm_spillblk[txgoff] = DN_KILL_SPILLBLK;
 	dn->dn_have_spill = B_FALSE;
 }
 
 /*
  * Set the in-core data block size of a dnode.
  *
  * size is in bytes and must be a multiple of SPA_MINBLOCKSIZE within
  * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].  Besides the byte size, the size in
  * SPA_MINBLOCKSIZE units is stored, and the shift is stored when size is a
  * power of two (0 otherwise).
  */
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 	/* The sector count has to fit the on-disk dn_datablkszsec field. */
 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 
 	dn->dn_datablksz = size;
 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 	if (ISP2(size))
 		dn->dn_datablkshift = highbit64(size - 1);
 	else
 		dn->dn_datablkshift = 0;
 }
 
 /*
  * Allocate an in-core dnode for on-disk dnode dnp and publish it through
  * handle dnh.
  *
  *	os	objset the dnode belongs to
  *	dnp	on-disk dnode the in-core fields are seeded from
  *	db	dbuf containing dnp (dnode_special_open() passes NULL)
  *	object	object number
  *	dnh	handle that will point at the new dnode
  *
  * Returns the dnode the handle ends up pointing at.  If the handle already
  * points at a dnode by the time os->os_lock is taken, the new allocation is
  * returned to the cache and that existing dnode is returned instead.
  *
  * NOTE(review): the lost-race path frees dn without calling
  * dmu_zfetch_fini(), unlike dnode_destroy() - confirm this is intentional.
  */
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
 	dnode_t *dn;
 
 	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 	dn->dn_moved = 0;
 
 	/*
 	 * Defer setting dn_objset until the dnode is ready to be a candidate
 	 * for the dnode_move() callback.
 	 */
 	dn->dn_object = object;
 	dn->dn_dbuf = db;
 	dn->dn_handle = dnh;
 	dn->dn_phys = dnp;
 
 	/* A zero on-disk block size leaves all three size fields zero. */
 	if (dnp->dn_datablkszsec) {
 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 	} else {
 		dn->dn_datablksz = 0;
 		dn->dn_datablkszsec = 0;
 		dn->dn_datablkshift = 0;
 	}
 	dn->dn_indblkshift = dnp->dn_indblkshift;
 	dn->dn_nlevels = dnp->dn_nlevels;
 	dn->dn_type = dnp->dn_type;
 	dn->dn_nblkptr = dnp->dn_nblkptr;
 	dn->dn_checksum = dnp->dn_checksum;
 	dn->dn_compress = dnp->dn_compress;
 	dn->dn_bonustype = dnp->dn_bonustype;
 	dn->dn_bonuslen = dnp->dn_bonuslen;
 	dn->dn_maxblkid = dnp->dn_maxblkid;
 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 	dn->dn_id_flags = 0;
 
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	mutex_enter(&os->os_lock);
 	if (dnh->dnh_dnode != NULL) {
 		/* Lost the allocation race. */
 		mutex_exit(&os->os_lock);
 		kmem_cache_free(dnode_cache, dn);
 		return (dnh->dnh_dnode);
 	}
 
 	/*
 	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
 	 * signifies that the special dnodes have no references from
 	 * their children (the entries in os_dnodes).  This allows
 	 * dnode_destroy() to easily determine if the last child has
 	 * been removed and then complete eviction of the objset.
 	 */
 	if (!DMU_OBJECT_IS_SPECIAL(object))
 		list_insert_head(&os->os_dnodes, dn);
 	membar_producer();
 
 	/*
 	 * Everything else must be valid before assigning dn_objset
 	 * makes the dnode eligible for dnode_move().
 	 */
 	dn->dn_objset = os;
 
 	dnh->dnh_dnode = dn;
 	mutex_exit(&os->os_lock);
 
 	/* Charge the in-core dnode against the ARC's space accounting. */
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 	return (dn);
 }
 
 /*
  * Tear down an in-core dnode and return it to dnode_cache.
  *
  * Caller must be holding the dnode handle, which is released upon return.
  *
  * Under os->os_lock the objset pointer is invalidated and, for non-special
  * objects, the dnode is unlinked from os_dnodes.  The remaining fields are
  * then reset to the values dnode_dest() asserts before the dnode is freed.
  * If this was the last dnode on os_dnodes while the objset is on an
  * evicting list, dmu_objset_evict_done() is called at the end.
  */
 static void
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
 	boolean_t complete_os_eviction = B_FALSE;
 
 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	mutex_enter(&os->os_lock);
 	POINTER_INVALIDATE(&dn->dn_objset);
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		list_remove(&os->os_dnodes, dn);
 		complete_os_eviction =
 		    list_is_empty(&os->os_dnodes) &&
 		    list_link_active(&os->os_evicting_node);
 	}
 	mutex_exit(&os->os_lock);
 
 	/* the dnode can no longer move, so we can release the handle */
 	zrl_remove(&dn->dn_handle->dnh_zrlock);
 
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 
 	dn->dn_dirtyctx = 0;
 	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
 	/* The bonus dbuf is evicted with its db_mtx entered first. */
 	if (dn->dn_bonus != NULL) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
 		dbuf_evict(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
 	dn->dn_zio = NULL;
 
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_id_flags = 0;
 	dn->dn_unlisted_l0_blkid = 0;
 
 	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	/* Undo the accounting done by dnode_create(). */
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 
 	if (complete_os_eviction)
 		dmu_objset_evict_done(os);
 }
 
 /*
  * Turn a free dnode into an allocated object in transaction tx.
  *
  *	dn		free dnode (type DMU_OT_NONE, on-disk image all zero)
  *	ot		object type to assign; must not be DMU_OT_NONE
  *	blocksize	data block size in bytes; 0 selects
  *			1 << zfs_default_bs, anything else is rounded up to
  *			a multiple of SPA_MINBLOCKSIZE
  *	ibs		indirect block shift; 0 selects zfs_default_ibs, and
  *			the result is clamped to
  *			[DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT]
  *	bonustype	type of the bonus buffer
  *	bonuslen	length of the bonus buffer in bytes
  *	tx		transaction performing the allocation
  *
  * The dnode is dirtied and the chosen geometry is recorded for tx's txg.
  */
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 	int t;
 
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize != 0)
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 	else
 		blocksize = 1 << zfs_default_bs;
 
 	if (ibs == 0)
 		ibs = zfs_default_ibs;
 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
 	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
 	    dn->dn_object, tx->tx_txg, blocksize, ibs);
 
 	/* The dnode must be free and the request well-formed. */
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 	ASSERT(ot != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT0(dn->dn_maxblkid);
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
 	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	/* No per-txg state may be left over. */
 	for (t = 0; t < TXG_SIZE; t++) {
 		ASSERT0(dn->dn_next_nblkptr[t]);
 		ASSERT0(dn->dn_next_nlevels[t]);
 		ASSERT0(dn->dn_next_indblkshift[t]);
 		ASSERT0(dn->dn_next_bonuslen[t]);
 		ASSERT0(dn->dn_next_bonustype[t]);
 		ASSERT0(dn->dn_rm_spillblk[t]);
 		ASSERT0(dn->dn_next_blksz[t]);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[t]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[t]), ==, NULL);
 		ASSERT3P(dn->dn_free_ranges[t], ==, NULL);
 	}
 
 	dn->dn_type = ot;
 	dnode_setdblksz(dn, blocksize);
 	dn->dn_indblkshift = ibs;
 	dn->dn_nlevels = 1;
 	/* An SA bonus gets the maximum bonus space: one block pointer. */
 	dn->dn_nblkptr = (bonustype == DMU_OT_SA) ? 1 :
 	    1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	dn->dn_dirtyctx = 0;
 
 	dn->dn_free_txg = 0;
 	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	dn->dn_id_flags = 0;
 
 	dnode_setdirty(dn, tx);
 	dn->dn_next_indblkshift[txgoff] = ibs;
 	dn->dn_next_bonuslen[txgoff] = dn->dn_bonuslen;
 	dn->dn_next_bonustype[txgoff] = dn->dn_bonustype;
 	dn->dn_next_blksz[txgoff] = dn->dn_datablksz;
 }
 
 /*
  * Re-purpose an already allocated dnode as an object of type ot within
  * the assigned transaction tx.
  *
  *	blocksize	new data block size in bytes; a multiple of
  *			SPA_MINBLOCKSIZE, at least SPA_MINBLOCKSIZE
  *	bonustype	new bonus buffer type
  *	bonuslen	new bonus buffer length in bytes
  *
  * Unreferenced dbufs are evicted first.  Changes to block size, bonus
  * length, bonus type and block pointer count are recorded for tx's txg
  * under dn_struct_rwlock (writer), and an existing spill block is scheduled
  * for removal.  The in-core bonus fields are then updated under dn_mtx.
  */
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);
 
 	dn->dn_id_flags = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdirty(dn, tx);
 	if (dn->dn_datablksz != blocksize) {
 		/* change blocksize */
 		ASSERT(dn->dn_maxblkid == 0 &&
 		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 		    dnode_block_freed(dn, 0)));
 		dnode_setdblksz(dn, blocksize);
 		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 	}
 	if (dn->dn_bonuslen != bonuslen)
 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 
 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 		nblkptr = 1;
 	else
 		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 	if (dn->dn_bonustype != bonustype)
 		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 	if (dn->dn_nblkptr != nblkptr)
 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
 	/* An on-disk spill block does not survive reallocation. */
 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		dbuf_rm_spill(dn, tx);
 		dnode_rm_spill(dn, tx);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* change type */
 	dn->dn_type = ot;
 
 	/* change bonus size and type */
 	mutex_enter(&dn->dn_mtx);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_nblkptr = nblkptr;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 
 	/* fix up the bonus db_size */
 	if (dn->dn_bonus) {
 		dn->dn_bonus->db.db_size =
 		    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 }
 
 #ifdef	DNODE_STATS
 /*
  * Counters bumped by dnode_move() through DNODE_STAT_ADD(), one for each
  * reason a move request is turned down.
  */
 static struct {
 	uint64_t dms_dnode_invalid;	/* objset pointer was not valid */
 	uint64_t dms_dnode_recheck1;	/* objset changed before os_lock */
 	uint64_t dms_dnode_recheck2;	/* objset changed before os->os_lock */
 	uint64_t dms_dnode_special;	/* dnode is a special object */
 	uint64_t dms_dnode_handle;	/* could not enter dnh_zrlock */
 	uint64_t dms_dnode_rwlock;	/* could not enter dn_struct_rwlock */
 	uint64_t dms_dnode_active;	/* more holds than dbufs */
 } dnode_move_stats;
 #endif	/* DNODE_STATS */
 
 /*
  * Transfer the complete state of odn into ndn and leave odn in the state
  * that dnode_dest() expects of a dnode in the cache.
  *
  *	odn	dnode being vacated; none of its locks may be held
  *	ndn	destination dnode; its dbuf tree must be empty
  *
  * Scalar fields and per-txg arrays are copied, the dirty-record lists,
  * holds, dbuf tree and prefetch streams are moved, and the handle's back
  * pointer is redirected to ndn.  odn is then cleared field by field (every
  * per-txg dn_next_* array included, since dnode_dest() asserts each one is
  * zero) and its objset pointer is invalidated.
  */
 static void
 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 {
 	int i;
 
 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
 
 	/* Copy fields. */
 	ndn->dn_objset = odn->dn_objset;
 	ndn->dn_object = odn->dn_object;
 	ndn->dn_dbuf = odn->dn_dbuf;
 	ndn->dn_handle = odn->dn_handle;
 	ndn->dn_phys = odn->dn_phys;
 	ndn->dn_type = odn->dn_type;
 	ndn->dn_bonuslen = odn->dn_bonuslen;
 	ndn->dn_bonustype = odn->dn_bonustype;
 	ndn->dn_nblkptr = odn->dn_nblkptr;
 	ndn->dn_checksum = odn->dn_checksum;
 	ndn->dn_compress = odn->dn_compress;
 	ndn->dn_nlevels = odn->dn_nlevels;
 	ndn->dn_indblkshift = odn->dn_indblkshift;
 	ndn->dn_datablkshift = odn->dn_datablkshift;
 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
 	ndn->dn_datablksz = odn->dn_datablksz;
 	ndn->dn_maxblkid = odn->dn_maxblkid;
 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 	    sizeof (odn->dn_next_nblkptr));
 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
 	    sizeof (odn->dn_next_nlevels));
 	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
 	    sizeof (odn->dn_next_indblkshift));
 	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
 	    sizeof (odn->dn_next_bonustype));
 	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 	    sizeof (odn->dn_rm_spillblk));
 	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 	    sizeof (odn->dn_next_bonuslen));
 	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 	    sizeof (odn->dn_next_blksz));
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_move_tail(&ndn->dn_dirty_records[i],
 		    &odn->dn_dirty_records[i]);
 	}
 	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
 	    sizeof (odn->dn_free_ranges));
 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
 	ndn->dn_free_txg = odn->dn_free_txg;
 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
 	ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
 	ndn->dn_bonus = odn->dn_bonus;
 	ndn->dn_have_spill = odn->dn_have_spill;
 	ndn->dn_zio = odn->dn_zio;
 	ndn->dn_oldused = odn->dn_oldused;
 	ndn->dn_oldflags = odn->dn_oldflags;
 	ndn->dn_olduid = odn->dn_olduid;
 	ndn->dn_oldgid = odn->dn_oldgid;
 	ndn->dn_newuid = odn->dn_newuid;
 	ndn->dn_newgid = odn->dn_newgid;
 	ndn->dn_id_flags = odn->dn_id_flags;
 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
 
 	/*
 	 * Update back pointers. Updating the handle fixes the back pointer of
 	 * every descendant dbuf as well as the bonus dbuf.
 	 */
 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
 	ndn->dn_handle->dnh_dnode = ndn;
 	if (ndn->dn_zfetch.zf_dnode == odn) {
 		ndn->dn_zfetch.zf_dnode = ndn;
 	}
 
 	/*
 	 * Invalidate the original dnode by clearing all of its back pointers.
 	 */
 	odn->dn_dbuf = NULL;
 	odn->dn_handle = NULL;
 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 	odn->dn_dbufs_count = 0;
 	odn->dn_unlisted_l0_blkid = 0;
 	odn->dn_bonus = NULL;
 	odn->dn_zfetch.zf_dnode = NULL;
 
 	/*
 	 * Set the low bit of the objset pointer to ensure that dnode_move()
 	 * recognizes the dnode as invalid in any subsequent callback.
 	 */
 	POINTER_INVALIDATE(&odn->dn_objset);
 
 	/*
 	 * Satisfy the destructor.
 	 */
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&odn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 		odn->dn_free_ranges[i] = NULL;
 		/* dnode_dest() asserts this array is zero like the others. */
 		odn->dn_next_nblkptr[i] = 0;
 		odn->dn_next_nlevels[i] = 0;
 		odn->dn_next_indblkshift[i] = 0;
 		odn->dn_next_bonustype[i] = 0;
 		odn->dn_rm_spillblk[i] = 0;
 		odn->dn_next_bonuslen[i] = 0;
 		odn->dn_next_blksz[i] = 0;
 	}
 	odn->dn_allocated_txg = 0;
 	odn->dn_free_txg = 0;
 	odn->dn_assigned_txg = 0;
 	odn->dn_dirtyctx = 0;
 	odn->dn_dirtyctx_firstset = NULL;
 	odn->dn_have_spill = B_FALSE;
 	odn->dn_zio = NULL;
 	odn->dn_oldused = 0;
 	odn->dn_oldflags = 0;
 	odn->dn_olduid = 0;
 	odn->dn_oldgid = 0;
 	odn->dn_newuid = 0;
 	odn->dn_newgid = 0;
 	odn->dn_id_flags = 0;
 
 	/*
 	 * Mark the dnode.
 	 */
 	ndn->dn_moved = 1;
 	odn->dn_moved = (uint8_t)-1;
 }
 
 #ifdef illumos
 #ifdef	_KERNEL
 /*
  * kmem move callback for the dnode cache: try to relocate the dnode at buf
  * into newbuf.
  *
  * Returns
  *	KMEM_CBRC_DONT_KNOW	the objset pointer is invalid or changed
  *				while the locks were being taken
  *	KMEM_CBRC_NO		the dnode is a special object
  *	KMEM_CBRC_LATER		the handle lock or dn_struct_rwlock could
  *				not be entered, or the dnode has more holds
  *				than dbufs
  *	KMEM_CBRC_YES		the dnode was moved by dnode_move_impl()
  *
  * size and arg are unused.
  */
 /*ARGSUSED*/
 static kmem_cbrc_t
 dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 {
 	dnode_t *odn = buf, *ndn = newbuf;
 	objset_t *os;
 	int64_t refcount;
 	uint32_t dbufs;
 
 	/*
 	 * The dnode is on the objset's list of known dnodes if the objset
 	 * pointer is valid. We set the low bit of the objset pointer when
 	 * freeing the dnode to invalidate it, and the memory patterns written
 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 	 * A newly created dnode sets the objset pointer last of all to indicate
 	 * that the dnode is known and in a valid state to be moved by this
 	 * function.
 	 */
 	os = odn->dn_objset;
 	if (!POINTER_IS_VALID(os)) {
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * Ensure that the objset does not go away during the move.
 	 */
 	rw_enter(&os_lock, RW_WRITER);
 	if (os != odn->dn_objset) {
 		rw_exit(&os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * If the dnode is still valid, then so is the objset. We know that no
 	 * valid objset can be freed while we hold os_lock, so we can safely
 	 * ensure that the objset remains in use.
 	 */
 	mutex_enter(&os->os_lock);
 
 	/*
 	 * Recheck the objset pointer in case the dnode was removed just before
 	 * acquiring the lock.
 	 */
 	if (os != odn->dn_objset) {
 		mutex_exit(&os->os_lock);
 		rw_exit(&os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * At this point we know that as long as we hold os->os_lock, the dnode
 	 * cannot be freed and fields within the dnode can be safely accessed.
 	 * The objset listing this dnode cannot go away as long as this dnode is
 	 * on its list.
 	 */
 	rw_exit(&os_lock);
 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
 		return (KMEM_CBRC_NO);
 	}
 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 
 	/*
 	 * Lock the dnode handle to prevent the dnode from obtaining any new
 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
 	 * from accessing the dnode, so that we can discount their holds. The
 	 * handle is safe to access because we know that while the dnode cannot
 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
 	 * safely move any dnode referenced only by dbufs.
 	 */
 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 	 * We need to guarantee that there is a hold for every dbuf in order to
 	 * determine whether the dnode is actively referenced. Falsely matching
 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
 	 * that a thread already having an active dnode hold is about to add a
 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
 	 * progress.
 	 */
 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
 	 * case, the dbuf count is decremented under the handle lock before the
 	 * dbuf's hold is released. This order ensures that if we count the hold
 	 * after the dbuf is removed but before its hold is released, we will
 	 * treat the unmatched hold as active and exit safely. If we count the
 	 * hold before the dbuf is removed, the hold is discounted, and the
 	 * removal is blocked until the move completes.
 	 */
 	refcount = refcount_count(&odn->dn_holds);
 	ASSERT(refcount >= 0);
 	dbufs = odn->dn_dbufs_count;
 
 	/* We can't have more dbufs than dnode holds. */
 	ASSERT3U(dbufs, <=, refcount);
 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 	    uint32_t, dbufs);
 
 	/* Any hold not matched by a dbuf means the dnode is in active use. */
 	if (refcount > dbufs) {
 		rw_exit(&odn->dn_struct_rwlock);
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
 		return (KMEM_CBRC_LATER);
 	}
 
 	rw_exit(&odn->dn_struct_rwlock);
 
 	/*
 	 * At this point we know that anyone with a hold on the dnode is not
 	 * actively referencing it. The dnode is known and in a valid state to
 	 * move. We're holding the locks needed to execute the critical section.
 	 */
 	dnode_move_impl(odn, ndn);
 
 	/* Let the new dnode take the old one's place on the objset's list. */
 	list_link_replace(&odn->dn_link, &ndn->dn_link);
 	/* If the dnode was safe to move, the refcount cannot have changed. */
 	ASSERT(refcount == refcount_count(&ndn->dn_holds));
 	ASSERT(dbufs == ndn->dn_dbufs_count);
 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 	mutex_exit(&os->os_lock);
 
 	return (KMEM_CBRC_YES);
 }
 #endif	/* _KERNEL */
 #endif	/* illumos */
 
 /*
  * Destroy the dnode behind a handle that was set up by
  * dnode_special_open(), then destroy the handle's lock and clear the
  * handle.  Polls, one delay(1) at a time, until the dnode has no holds.
  */
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
 	dnode_t *dn = dnh->dnh_dnode;
 
 	/*
 	 * Wait for final references to the dnode to clear.  This can
 	 * only happen if the arc is asynchronously evicting state that
 	 * has a hold on this dnode while we are trying to evict this
 	 * dnode.
 	 */
 	while (refcount_count(&dn->dn_holds) > 0)
 		delay(1);
 	ASSERT(dn->dn_dbuf == NULL ||
 	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
 	/* dnode_destroy() expects the handle to be held and releases it. */
 	zrl_add(&dnh->dnh_zrlock);
 	dnode_destroy(dn); /* implicit zrl_remove() */
 	zrl_destroy(&dnh->dnh_zrlock);
 	dnh->dnh_dnode = NULL;
 }
 
 /*
  * Create the in-core dnode for on-disk dnode dnp, which has no containing
  * dbuf (NULL is passed to dnode_create()), attach it to dnh and initialize
  * the handle's lock.
  */
 void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
 	dnode_t *dn;
 
 	dn = dnode_create(os, dnp, NULL, object, dnh);
 	zrl_init(&dnh->dnh_zrlock);
 	DNODE_VERIFY(dn);
 }
 
 /*
  * Release everything hanging off a dnode_children_t: destroy each
  * instantiated child dnode, destroy every handle lock, and free the
  * children array itself.  dbu is the dnode_children_t to release.
  */
 static void
 dnode_buf_pageout(void *dbu)
 {
 	dnode_children_t *kids = dbu;
 	dnode_handle_t *dnh = &kids->dnc_children[0];
 	dnode_handle_t *end = &kids->dnc_children[kids->dnc_count];
 
 	for (; dnh < end; dnh++) {
 		dnode_t *dn;
 
 		/*
 		 * The handle lock only protects against the dnode moving
 		 * to another valid address, so a transition to or from
 		 * NULL needs no extra guarding here.
 		 */
 		if (dnh->dnh_dnode == NULL) {
 			zrl_destroy(&dnh->dnh_zrlock);
 			continue;
 		}
 
 		zrl_add(&dnh->dnh_zrlock);
 		dn = dnh->dnh_dnode;
 
 		/*
 		 * A held dnode implies holds on the dbuf that contains it,
 		 * which would have kept that dbuf from being evicted and
 		 * this function from being called.
 		 */
 		ASSERT(refcount_is_zero(&dn->dn_holds));
 		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
 		dnode_destroy(dn); /* implicit zrl_remove() */
 		zrl_destroy(&dnh->dnh_zrlock);
 		dnh->dnh_dnode = NULL;
 	}
 
 	kmem_free(kids, sizeof (dnode_children_t) +
 	    kids->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
  * Look up the in-core dnode for 'object' in 'os', creating it if it has
  * not been instantiated yet, and add a hold on it for 'tag'.  On success
  * the held dnode is returned through 'dnp'.
  *
  * 'flag' may contain DNODE_MUST_BE_ALLOCATED and/or DNODE_MUST_BE_FREE to
  * restrict which dnodes are acceptable.
  *
  * errors:
  * EINVAL - invalid object number.
  * EIO - i/o error (errors from dbuf_read() are passed through unchanged).
  * ENOENT - the dnode did not satisfy 'flag' (or is being freed) and its
  *	type is DMU_OT_NONE; also returned when the requested
  *	user/group-used dnode does not exist.
  * EEXIST - the dnode did not satisfy 'flag' (or is being freed) and its
  *	type is not DMU_OT_NONE.
  * succeeds even for free dnodes when 'flag' does not forbid it.
  */
 int
 dnode_hold_impl(objset_t *os, uint64_t object, int flag,
     void *tag, dnode_t **dnp)
 {
 	int epb, idx, err;
 	int drop_struct_lock = FALSE;
 	int type;
 	uint64_t blk;
 	dnode_t *mdn, *dn;
 	dmu_buf_impl_t *db;
 	dnode_children_t *children_dnodes;
 	dnode_handle_t *dnh;
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
 	 * be asking the DMU to do *anything* unless it's the root pool
 	 * which may require us to read from the root filesystem while
 	 * holding some (not all) of the locks as writer.
 	 */
 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
 	    (spa_is_root(os->os_spa) &&
 	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
 	/*
 	 * The user/group-used objects are reached directly through the
 	 * objset rather than through the meta-dnode's blocks.
 	 */
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
 		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
 		if (dn == NULL)
 			return (SET_ERROR(ENOENT));
 		type = dn->dn_type;
 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
 			return (SET_ERROR(ENOENT));
 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
 			return (SET_ERROR(EEXIST));
 		DNODE_VERIFY(dn);
 		(void) refcount_add(&dn->dn_holds, tag);
 		*dnp = dn;
 		return (0);
 	}
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return (SET_ERROR(EINVAL));
 
 	mdn = DMU_META_DNODE(os);
 	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
 	DNODE_VERIFY(mdn);
 
 	/* Take the meta-dnode's struct lock unless we already hold it. */
 	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	/* Find and hold the meta-dnode block that contains this object. */
 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
 		rw_exit(&mdn->dn_struct_rwlock);
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 	if (err) {
 		dbuf_rele(db, FTAG);
 		return (err);
 	}
 
 	/* epb: number of dnodes in this block; idx: our slot within it. */
 	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
 	epb = db->db.db_size >> DNODE_SHIFT;
 
 	idx = object & (epb-1);
 
 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	children_dnodes = dmu_buf_get_user(&db->db);
 	if (children_dnodes == NULL) {
 		int i;
 		dnode_children_t *winner;
 		children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
 		    epb * sizeof (dnode_handle_t), KM_SLEEP);
 		children_dnodes->dnc_count = epb;
 		dnh = &children_dnodes->dnc_children[0];
 		for (i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
 		}
 		dmu_buf_init_user(&children_dnodes->dnc_dbu,
 		    dnode_buf_pageout, NULL);
 		winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
 		if (winner != NULL) {
 			/*
 			 * Another thread attached its handle array first;
 			 * discard ours and use the winner's.
 			 */
 
 			for (i = 0; i < epb; i++) {
 				zrl_destroy(&dnh[i].dnh_zrlock);
 			}
 
 			kmem_free(children_dnodes, sizeof (dnode_children_t) +
 			    epb * sizeof (dnode_handle_t));
 			children_dnodes = winner;
 		}
 	}
 	ASSERT(children_dnodes->dnc_count == epb);
 
 	dnh = &children_dnodes->dnc_children[idx];
 	zrl_add(&dnh->dnh_zrlock);
 	dn = dnh->dnh_dnode;
 	if (dn == NULL) {
 		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
 
 		dn = dnode_create(os, phys, db, object, dnh);
 	}
 
 	mutex_enter(&dn->dn_mtx);
 	type = dn->dn_type;
 	if (dn->dn_free_txg ||
 	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
 	    ((flag & DNODE_MUST_BE_FREE) &&
 	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
 		mutex_exit(&dn->dn_mtx);
 		zrl_remove(&dnh->dnh_zrlock);
 		dbuf_rele(db, FTAG);
 		/*
 		 * Report the refusal through SET_ERROR() like every other
 		 * error return in this function.
 		 */
 		err = (type == DMU_OT_NONE) ? ENOENT : EEXIST;
 		return (SET_ERROR(err));
 	}
 	/* The first hold on the dnode also holds its containing dbuf. */
 	if (refcount_add(&dn->dn_holds, tag) == 1)
 		dbuf_add_ref(db, dnh);
 	mutex_exit(&dn->dn_mtx);
 
 	/* Now we can rely on the hold to prevent the dnode from moving. */
 	zrl_remove(&dnh->dnh_zrlock);
 
 	DNODE_VERIFY(dn);
 	ASSERT3P(dn->dn_dbuf, ==, db);
 	ASSERT3U(dn->dn_object, ==, object);
 	dbuf_rele(db, FTAG);
 
 	*dnp = dn;
 	return (0);
 }
 
 /*
  * Hold the dnode for 'object', which must be allocated.  Returns 0 and
  * passes the held dnode back through 'dnp' on success; otherwise returns
  * the error reported by dnode_hold_impl().
  */
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
 	int error;
 
 	error = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp);
 	return (error);
 }
 
 /*
  * Add a hold for 'tag' to a dnode that is already held by someone.
  * Returns TRUE if the hold was added, or FALSE (adding nothing) if the
  * dnode currently has no holds at all.
  */
 boolean_t
 dnode_add_ref(dnode_t *dn, void *tag)
 {
 	boolean_t added = FALSE;
 
 	mutex_enter(&dn->dn_mtx);
 	if (!refcount_is_zero(&dn->dn_holds)) {
 		/* There was at least one hold, so ours cannot be the first. */
 		VERIFY(1 < refcount_add(&dn->dn_holds, tag));
 		added = TRUE;
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	return (added);
 }
 
 /*
  * Release the hold identified by 'tag'.  Enters dn_mtx and hands off to
  * dnode_rele_and_unlock(), which drops the mutex again.
  */
 void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
 
 	/* Returns with dn_mtx released. */
 	dnode_rele_and_unlock(dn, tag);
 }
 
 /*
  * Remove the hold identified by 'tag' and exit dn_mtx, which the caller
  * must already have entered.  When the last hold goes away, the dnode's
  * hold on its containing dbuf (if it has one) is released as well.
  */
 void
 dnode_rele_and_unlock(dnode_t *dn, void *tag)
 {
 	/* Sample these while our hold still keeps the dnode from moving. */
 	dmu_buf_impl_t *parent = dn->dn_dbuf;
 	dnode_handle_t *dnh = dn->dn_handle;
 	uint64_t refs;
 
 	refs = refcount_remove(&dn->dn_holds, tag);
 	mutex_exit(&dn->dn_mtx);
 
 	/*
 	 * Releasing the last hold (directly via dnode_rele() or indirectly
 	 * via dbuf_rele()) while relying on the dnode handle to keep the
 	 * dnode from moving is unsafe: dropping the last hold may cause the
 	 * parent dbuf to evict its dnode handles.  Callers without some
 	 * other direct or indirect hold on the dnode must therefore drop
 	 * the dnode handle before calling dnode_rele() or dbuf_rele().
 	 */
 	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
 
 	/* Nothing more to do unless this was the last hold ... */
 	if (refs != 0)
 		return;
 
 	/* ... and the dnode has a parent dbuf (the DNODE_DNODE has none). */
 	if (parent == NULL)
 		return;
 
 	/*
 	 * A concurrent dnode_hold_impl() may add a hold to the handle while
 	 * it holds the parent dbuf.  That is fine, because the hold on the
 	 * parent dbuf keeps the handle from being destroyed.  We cannot
 	 * assert here that the handle has zero references, but that is
 	 * asserted anyway when the handle gets destroyed.
 	 */
 	dbuf_rele(parent, dnh);
 }
 
 /*
  * Mark 'dn' dirty in the txg of 'tx': put it on the objset's dirty (or
  * free) dnode list for that txg, take a "dirty hold" on it, and dirty
  * its containing dbuf and its dataset.  Special objects only dirty the
  * dataset.  Does nothing further if the dnode is already on a list for
  * this txg.
  */
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;
 	uint64_t txgoff = txg & TXG_MASK;
 
 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		dsl_dataset_dirty(os->os_dsl_dataset, tx);
 		return;
 	}
 
 	DNODE_VERIFY(dn);
 
 #ifdef ZFS_DEBUG
 	mutex_enter(&dn->dn_mtx);
 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
 	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
 	mutex_exit(&dn->dn_mtx);
 #endif
 
 	/* Determine the old uid/gid when necessary. */
 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
 
 	mutex_enter(&os->os_lock);
 
 	/* Already on a list for this txg: nothing left to do. */
 	if (list_link_active(&dn->dn_dirty_link[txgoff])) {
 		mutex_exit(&os->os_lock);
 		return;
 	}
 
 	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
 	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
 	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
 	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    dn->dn_object, txg);
 
 	/* A dnode being freed in this txg (or earlier) goes on the free list */
 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
 		list_insert_tail(&os->os_free_dnodes[txgoff], dn);
 	} else {
 		list_insert_tail(&os->os_dirty_dnodes[txgoff], dn);
 	}
 
 	mutex_exit(&os->os_lock);
 
 	/*
 	 * A dnode keeps a hold on its containing dbuf for as long as the
 	 * dnode itself is held, and every instantiated child dbuf holds
 	 * the dnode.  Once the last child drops its hold, the dnode drops
 	 * its hold on the containing dbuf.  The "dirty hold" added here
 	 * keeps the dnode around after its children have been processed.
 	 */
 	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
 
 	(void) dbuf_dirty(dn->dn_dbuf, tx);
 
 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
 }
 
 /*
  * Mark 'dn' as being freed in the txg of 'tx' and make sure it sits on
  * the objset's free list for that txg.  Does nothing if the dnode has no
  * type or is already marked as being freed.
  */
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	int nothing_to_do;
 	int was_dirty;
 
 	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
 
 	/* we should be the only holder... hopefully */
 	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
 
 	mutex_enter(&dn->dn_mtx);
 	nothing_to_do = (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg);
 	if (!nothing_to_do)
 		dn->dn_free_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 
 	if (nothing_to_do)
 		return;
 
 	/*
 	 * A dnode that is already dirty in this txg has to be moved from
 	 * the dirty list to the free list; otherwise dnode_setdirty() will
 	 * place it on the right list.
 	 */
 	mutex_enter(&os->os_lock);
 	was_dirty = list_link_active(&dn->dn_dirty_link[txgoff]);
 	if (was_dirty) {
 		list_remove(&os->os_dirty_dnodes[txgoff], dn);
 		list_insert_tail(&os->os_free_dnodes[txgoff], dn);
 	}
 	mutex_exit(&os->os_lock);
 
 	if (!was_dirty)
 		dnode_setdirty(dn, tx);
 }
 
 /*
  * Attempt to change the data block size (and optionally the indirect
  * block shift) of 'dn'.  This is only possible while nothing is allocated
  * or dirty beyond the first block.
  *
  * 'size' is rounded up to a multiple of SPA_MINBLOCKSIZE (0 selects
  * SPA_MINBLOCKSIZE).  An 'ibs' equal to the current indirect block shift
  * is treated as "no change".
  *
  * Returns 0 on success (including when nothing needs to change) and
  * ENOTSUP when the change cannot be made.
  */
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	int err;
 
 	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	size = (size == 0) ?
 	    SPA_MINBLOCKSIZE : P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;
 
 	/* Nothing to do if neither setting would actually change. */
 	if (ibs == 0 && size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec)
 		return (0);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Refuse if any block beyond the first is allocated ... */
 	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	/* ... or instantiated (bonus and spill dbufs do not count). */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db = avl_first(&dn->dn_dbufs);
 	while (db != NULL) {
 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
 		db = AVL_NEXT(&dn->dn_dbufs, db);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 
 	/* The indirect block shift can only change on a one-level object. */
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;
 
 	/* Resize the existing first block; ENOENT means there is none. */
 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 	if (err != 0 && err != ENOENT)
 		goto fail;
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 
 	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
 	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
 		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
 	}
 
 	/* Only release the dbuf once the dnode's block size is fixed up. */
 	if (db != NULL)
 		dbuf_rele(db, FTAG);
 
 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);
 
 fail:
 	rw_exit(&dn->dn_struct_rwlock);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Note that block 'blkid' is now part of the object: raise dn_maxblkid if
  * 'blkid' lies beyond it and, if necessary, increase the number of
  * indirection levels so that the block tree can address it.
  *
  * If 'have_read' is set the caller holds dn_struct_rwlock as reader;
  * otherwise it must hold it as writer.  In the have_read case the lock is
  * upgraded to writer for the update (dropping and re-entering it if the
  * upgrade attempt fails) and downgraded to reader again before returning.
  *
  * read-holding callers must not rely on the lock being continuously held
  */
 void
 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 	int epbs, new_nlevels;
 	uint64_t sz;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	ASSERT(have_read ?
 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
 	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * if we have a read-lock, check to see if we need to do any work
 	 * before upgrading to a write-lock.
 	 */
 	if (have_read) {
 		if (blkid <= dn->dn_maxblkid)
 			return;
 
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 	}
 
 	/*
 	 * Re-check under the write lock; the lock may have been dropped
 	 * above, so dn_maxblkid may have changed since the first check.
 	 */
 	if (blkid <= dn->dn_maxblkid)
 		goto out;
 
 	dn->dn_maxblkid = blkid;
 
 	/*
 	 * Compute the number of levels necessary to support the new maxblkid.
 	 * NOTE(review): the "sz >= dn->dn_nblkptr" term presumably stops the
 	 * loop once the shift wraps sz around -- confirm.
 	 */
 	new_nlevels = 1;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	for (sz = dn->dn_nblkptr;
 	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
 		new_nlevels++;
 
 	if (new_nlevels > dn->dn_nlevels) {
 		int old_nlevels = dn->dn_nlevels;
 		dmu_buf_impl_t *db;
 		list_t *list;
 		dbuf_dirty_record_t *new, *dr, *dr_next;
 
 		dn->dn_nlevels = new_nlevels;
 
 		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
 		dn->dn_next_nlevels[txgoff] = new_nlevels;
 
 		/* dirty the left indirects */
 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
 		ASSERT(db != NULL);
 		new = dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);
 
 		/*
 		 * transfer the dirty records to the new indirect
 		 *
 		 * Records for the new top level and for the bonus and spill
 		 * blocks stay on the dnode's list; all others (which must be
 		 * at the old top level) become children of 'new'.
 		 */
 		mutex_enter(&dn->dn_mtx);
 		mutex_enter(&new->dt.di.dr_mtx);
 		list = &dn->dn_dirty_records[txgoff];
 		for (dr = list_head(list); dr; dr = dr_next) {
 			/* fetch the successor first; dr may be unlinked below */
 			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
 			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
 			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
 				list_remove(&dn->dn_dirty_records[txgoff], dr);
 				list_insert_tail(&new->dt.di.dr_children, dr);
 				dr->dr_parent = new;
 			}
 		}
 		mutex_exit(&new->dt.di.dr_mtx);
 		mutex_exit(&dn->dn_mtx);
 	}
 
 out:
 	/* Give read-holding callers back the reader lock they came in with. */
 	if (have_read)
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
 /*
  * Dirty the level-1 indirect block 'l1blkid' of 'dn' in 'tx'.  Silently
  * does nothing if that block cannot be held.
  */
 static void
 dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *l1db;
 
 	l1db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
 	if (l1db == NULL)
 		return;
 
 	dmu_buf_will_dirty(&l1db->db, tx);
 	dbuf_rele(l1db, FTAG);
 }
 
 /*
  * Free the byte range of 'dn' that starts at 'off' and is 'len' bytes
  * long, in the txg of 'tx'.  Passing DMU_OBJECT_END as 'len' frees
  * everything from 'off' onward.
  *
  * Partial blocks at the start and end of the range are zeroed in place
  * (only if they are dirty or exist on disk).  The whole blocks in between
  * are recorded in dn_free_ranges for this txg, the matching dbufs are
  * freed with dbuf_free_range(), and the dnode is dirtied; the free itself
  * is completed in the syncing phase.
  *
  * dn_struct_rwlock is entered as writer for the duration of the call, but
  * is dropped and re-entered around each dmu_buf_will_dirty() call.
  */
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	uint64_t blkoff, blkid, nblks;
 	int blksz, blkshift, head, tail;
 	int trunc = FALSE;
 	int epbs;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	blksz = dn->dn_datablksz;
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	/* DMU_OBJECT_END: free through the largest representable offset. */
 	if (len == DMU_OBJECT_END) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
 	}
 
 	/*
 	 * First, block align the region to free:
 	 */
 	if (ISP2(blksz)) {
 		/* head: bytes from 'off' up to the next block boundary */
 		head = P2NPHASE(off, blksz);
 		blkoff = P2PHASE(off, blksz);
 		if ((off >> blkshift) > dn->dn_maxblkid)
 			goto out;
 	} else {
 		/* A non-power-of-2 block size implies a single block. */
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
 			/*
 			 * Freeing the whole block; fast-track this request.
 			 * Note that we won't dirty any indirect blocks,
 			 * which is fine because we will be freeing the entire
 			 * file and thus all indirect blocks will be freed
 			 * by free_children().
 			 */
 			blkid = 0;
 			nblks = 1;
 			goto done;
 		} else if (off >= blksz) {
 			/* Freeing past end-of-data */
 			goto out;
 		} else {
 			/* Freeing part of the block. */
 			head = blksz - off;
 			ASSERT3U(head, >, 0);
 		}
 		blkoff = off;
 	}
 	/* zero out any partial block data at the start of the range */
 	if (head) {
 		ASSERT3U(blkoff + head, ==, blksz);
 		/* the range may end before the end of this block */
 		if (len < head)
 			head = len;
 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
 		    TRUE, FALSE, FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				/* struct lock is not held across the dirty */
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				data = db->db.db_data;
 				bzero(data + blkoff, head);
 			}
 			dbuf_rele(db, FTAG);
 		}
 		off += head;
 		len -= head;
 	}
 
 	/* If the range was less than one block, we're done */
 	if (len == 0)
 		goto out;
 
 	/* If the remaining range is past end of file, we're done */
 	if ((off >> blkshift) > dn->dn_maxblkid)
 		goto out;
 
 	ASSERT(ISP2(blksz));
 	/* tail: bytes of the range that spill into its last, partial block */
 	if (trunc)
 		tail = 0;
 	else
 		tail = P2PHASE(len, blksz);
 
 	ASSERT0(P2PHASE(off, blksz));
 	/* zero out any partial block data at the end of the range */
 	if (tail) {
 		if (len < tail)
 			tail = len;
 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
 		    TRUE, FALSE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				/* struct lock is not held across the dirty */
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				bzero(db->db.db_data, tail);
 			}
 			dbuf_rele(db, FTAG);
 		}
 		len -= tail;
 	}
 
 	/* If the range did not include a full block, we are done */
 	if (len == 0)
 		goto out;
 
 	/* What remains is a run of whole blocks: convert bytes to blkids. */
 	ASSERT(IS_P2ALIGNED(off, blksz));
 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
 	blkid = off >> blkshift;
 	nblks = len >> blkshift;
 	/*
 	 * NOTE(review): presumably compensates for 'len' (UINT64_MAX - off)
 	 * not being a whole number of blocks when truncating -- confirm.
 	 */
 	if (trunc)
 		nblks += 1;
 
 	/*
 	 * Dirty all the indirect blocks in this range.  Note that only
 	 * the first and last indirect blocks can actually be written
 	 * (if they were partially freed) -- they must be dirtied, even if
 	 * they do not exist on disk yet.  The interior blocks will
 	 * be freed by free_children(), so they will not actually be written.
 	 * Even though these interior blocks will not be written, we
 	 * dirty them for two reasons:
 	 *
 	 *  - It ensures that the indirect blocks remain in memory until
 	 *    syncing context.  (They have already been prefetched by
 	 *    dmu_tx_hold_free(), so we don't have to worry about reading
 	 *    them serially here.)
 	 *
 	 *  - The dirty space accounting will put pressure on the txg sync
 	 *    mechanism to begin syncing, and to delay transactions if there
 	 *    is a large amount of freeing.  Even though these indirect
 	 *    blocks will not be written, we could need to write the same
 	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
 		uint64_t first, last;
 
 		/* first/last: level-1 blkids covering the ends of the range */
 		first = blkid >> epbs;
 		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
 		if (last != first)
 			dnode_dirty_l1(dn, last, tx);
 
 		/* shift: log2 of the bytes spanned by one level-1 block */
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		for (uint64_t i = first + 1; i < last; i++) {
 			/*
 			 * Set i to the blockid of the next non-hole
 			 * level-1 indirect block at or after i.  Note
 			 * that dnode_next_offset() operates in terms of
 			 * level-0-equivalent bytes.
 			 */
 			uint64_t ibyte = i << shift;
 			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
 			    &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
 			if (i >= last)
 				break;
 
 			/*
 			 * Normally we should not see an error, either
 			 * from dnode_next_offset() or dbuf_hold_level()
 			 * (except for ESRCH from dnode_next_offset).
 			 * If there is an i/o error, then when we read
 			 * this block in syncing context, it will use
 			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
 			 * to the "failmode" property.  dnode_next_offset()
 			 * doesn't have a flag to indicate MUSTSUCCEED.
 			 */
 			if (err != 0)
 				break;
 
 			dnode_dirty_l1(dn, i, tx);
 		}
 	}
 
 done:
 	/*
 	 * Add this range to the dnode range list.
 	 * We will finish up this free operation in the syncing phase.
 	 */
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	/* the per-txg range tree is created on first use */
 	if (dn->dn_free_ranges[txgoff] == NULL) {
 		dn->dn_free_ranges[txgoff] =
 		    range_tree_create(NULL, NULL, &dn->dn_mtx);
 	}
 	/* clear first so the add never overlaps an existing segment */
 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
 	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
 	    blkid, nblks, tx->tx_txg);
 	mutex_exit(&dn->dn_mtx);
 
 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 out:
 
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Return whether any txg slot of 'dn' has its spill block marked with
  * DN_KILL_SPILLBLK.
  */
 static boolean_t
 dnode_spill_freed(dnode_t *dn)
 {
 	boolean_t killed = B_FALSE;
 	int txgoff;
 
 	mutex_enter(&dn->dn_mtx);
 	for (txgoff = 0; txgoff < TXG_SIZE; txgoff++) {
 		if (dn->dn_rm_spillblk[txgoff] == DN_KILL_SPILLBLK) {
 			killed = B_TRUE;
 			break;
 		}
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	return (killed);
 }
 
 /*
  * Return TRUE if block 'blkid' of 'dn' was freed in a recent txg (one
  * that still has a dn_free_ranges entry), or FALSE if it was not.  The
  * bonus block is never reported as freed.
  */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
 	uint64_t freed = FALSE;
 	int txgoff;
 
 	/*
 	 * While the pool is still being opened dp is not set yet, but
 	 * there should not be anything dirty at that point either.
 	 */
 	if (blkid == DMU_BONUS_BLKID || dp == NULL)
 		return (FALSE);
 
 	/* Every block of a dnode that is being freed counts as freed. */
 	if (dn->dn_free_txg)
 		return (TRUE);
 
 	if (blkid == DMU_SPILL_BLKID)
 		return (dnode_spill_freed(dn));
 
 	mutex_enter(&dn->dn_mtx);
 	for (txgoff = 0; txgoff < TXG_SIZE; txgoff++) {
 		if (dn->dn_free_ranges[txgoff] == NULL)
 			continue;
 		if (range_tree_contains(dn->dn_free_ranges[txgoff],
 		    blkid, 1)) {
 			freed = TRUE;
 			break;
 		}
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	return (freed);
 }
 
 /*
  * Adjust the space accounted to 'dn' by 'delta' bytes (which may be
  * negative).  Call from syncing context when space is actually written
  * or freed for this dnode.
  */
 void
 dnode_diduse_space(dnode_t *dn, int64_t delta)
 {
 	uint64_t space;
 
 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
 	    dn, dn->dn_phys,
 	    (u_longlong_t)dn->dn_phys->dn_used,
 	    (longlong_t)delta);
 
 	mutex_enter(&dn->dn_mtx);
 
 	space = DN_USED_BYTES(dn->dn_phys);
 	if (delta <= 0) {
 		ASSERT3U(space, >=, -delta); /* no underflow */
 	} else {
 		ASSERT3U(space + delta, >=, space); /* no overflow */
 	}
 	space += delta;
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_DNODE_BYTES) {
 		/* Newer pools store the byte count and flag it as such. */
 		dn->dn_phys->dn_used = space;
 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
 	} else {
 		/* Older pools store the count in DEV_BSHIFT-sized units. */
 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
 	}
 
 	mutex_exit(&dn->dn_mtx);
 }
 
 /*
  * Called from open context when we expect to write or free 'space' bytes,
  * so that the memory in use by the currently open txg can be tracked.
  * The dsl_dir and dsl_pool are only charged when the objset has a
  * dataset; the tx itself is always charged.
  */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
 	objset_t *objset = dn->dn_objset;
 	dsl_dataset_t *dataset = objset->os_dsl_dataset;
 	int64_t asize;
 
 	asize = spa_get_asize(objset->os_spa, space);
 
 	if (dataset != NULL) {
 		dsl_dir_willuse_space(dataset->ds_dir, asize, tx);
 		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
 	}
 
 	dmu_tx_willuse_space(tx, asize);
 }
 
 /*
  * Scans a block at the indicated "level" looking for a hole or data,
  * depending on 'flags'.
  *
  * If level > 0, then we are scanning an indirect block looking at its
  * pointers.  If level == 0, then we are looking at a block of dnodes.
  *
  * If we don't find what we are looking for in the block, we return ESRCH.
  * Otherwise, return with *offset pointing to the beginning (if searching
  * forwards) or end (if searching backwards) of the range covered by the
  * block pointer we matched on (or dnode).
  *
  * The basic search algorithm used below by dnode_next_offset() is to
  * use this function to search up the block tree (widen the search) until
  * we find something (i.e., we don't return ESRCH) and then search back
  * down the tree (narrow the search) until we reach our original search
  * level.
  *
  * Parameters:
  *	flags	- DNODE_FIND_HOLE selects a hole search (otherwise data);
  *		  DNODE_FIND_BACKWARDS searches toward lower offsets.
  *	offset	- in/out search position, updated as entries are skipped.
  *	lvl	- level to scan; lvl == dn_nlevels scans the block pointers
  *		  stored in the dnode itself.
  *	blkfill	- number of items expected in a level-0 block.
  *	txg	- if nonzero, only block pointers born after this txg match
  *		  (not allowed together with a hole search).
  *
  * Errors from dbuf_hold_impl() (other than ENOENT) and from dbuf_read()
  * are returned unchanged.
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
     int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	uint64_t epb = 1ULL << epbs;
 	uint64_t minfill, maxfill;
 	boolean_t hole;
 	int i, inc, error, span;
 
 	dprintf("probing object %llu offset %llx level %d of %u\n",
 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
 
 	hole = ((flags & DNODE_FIND_HOLE) != 0);
 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
 	ASSERT(txg == 0 || !hole);
 
 	if (lvl == dn->dn_phys->dn_nlevels) {
 		/*
 		 * Top of the tree: scan the block pointers embedded in the
 		 * dnode, so there is no dbuf to hold or read.
 		 */
 		error = 0;
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
 			/* a block that does not exist is itself a hole */
 			if (hole)
 				return (0);
 			/*
 			 * This can only happen when we are searching up
 			 * the block tree for data.  We don't really need to
 			 * adjust the offset, as we will just end up looking
 			 * at the pointer to this block in its parent, and its
 			 * going to be unallocated, so we will skip over it.
 			 */
 			return (SET_ERROR(ESRCH));
 		}
 		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
 		if (error) {
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
 		data = db->db.db_data;
 	}
 
 
 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
 	    db->db_blkptr->blk_birth <= txg ||
 	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
 		 * and these conditions mean that we need to keep climbing.
 		 */
 		error = SET_ERROR(ESRCH);
 	} else if (lvl == 0) {
 		/*
 		 * Level 0: 'data' is an array of dnodes.  Step through them
 		 * one at a time, moving *offset by one dnode per step, until
 		 * one matches (free when looking for a hole, allocated
 		 * otherwise) or we run off either end of the block.
 		 */
 		dnode_phys_t *dnp = data;
 		span = DNODE_SHIFT;
 		ASSERT(dn->dn_type == DMU_OT_DNODE);
 
 		for (i = (*offset >> span) & (blkfill - 1);
 		    i >= 0 && i < blkfill; i += inc) {
 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
 				break;
 			*offset += (1ULL << span) * inc;
 		}
 		if (i < 0 || i == blkfill)
 			error = SET_ERROR(ESRCH);
 	} else {
 		/*
 		 * Level > 0: 'data' is an array of block pointers, each
 		 * covering (1 << span) bytes of level-0-equivalent offset.
 		 * A hole search accepts any bp whose fill count is below the
 		 * maximum possible for its subtree; a data search accepts any
 		 * bp with a fill count of at least one.
 		 */
 		blkptr_t *bp = data;
 		uint64_t start = *offset;
 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
 		minfill = 0;
 		maxfill = blkfill << ((lvl - 1) * epbs);
 
 		if (hole)
 			maxfill--;
 		else
 			minfill++;
 
 		/* work in units of block pointers while scanning */
 		*offset = *offset >> span;
 		for (i = BF64_GET(*offset, 0, epbs);
 		    i >= 0 && i < epb; i += inc) {
 			if (BP_GET_FILL(&bp[i]) >= minfill &&
 			    BP_GET_FILL(&bp[i]) <= maxfill &&
 			    (hole || bp[i].blk_birth > txg))
 				break;
 			/* never step below zero when going backwards */
 			if (inc > 0 || *offset > 0)
 				*offset += inc;
 		}
 		*offset = *offset << span;
 		if (inc < 0) {
 			/* traversing backwards; position offset at the end */
 			ASSERT3U(*offset, <=, start);
 			*offset = MIN(*offset + (1ULL << span) - 1, start);
 		} else if (*offset < start) {
 			/* rounding down to the bp boundary must not go back */
 			*offset = start;
 		}
 		if (i < 0 || i >= epb)
 			error = SET_ERROR(ESRCH);
 	}
 
 	if (db)
 		dbuf_rele(db, FTAG);
 
 	return (error);
 }
 
 /*
  * Locate the next hole, data, or sparse region at or after *offset (or at
  * or before it when DNODE_FIND_BACKWARDS is set).  'blkfill' is the
  * number of items expected in an L0 data block: 1 for normal objects,
  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
  * DNODES_PER_BLOCK when looking for sparse regions of the meta dnode.
  *
  * dn_struct_rwlock is taken as reader for the duration of the search
  * unless the caller passes DNODE_FIND_HAVELOCK.
  *
  * Returns 0 with *offset updated on success, ESRCH if nothing suitable
  * was found, or another error from the per-level search.
  *
  * Examples:
  *
  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
  *	Finds the next/previous hole/data in a file.
  *	Used in dmu_offset_next().
  *
  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
  *	Finds the next free/allocated dnode an objset's meta-dnode.
  *	Only finds objects that have new contents since txg (ie.
  *	bonus buffer changes and content removal are ignored).
  *	Used in dmu_object_next().
  *
  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
  *	Used in dmu_object_alloc().
  */
 int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
 	uint64_t start = *offset;
 	int take_lock = !(flags & DNODE_FIND_HAVELOCK);
 	int backwards = (flags & DNODE_FIND_BACKWARDS) != 0;
 	int want_hole = (flags & DNODE_FIND_HOLE) != 0;
 	int lvl, maxlvl;
 	int error = 0;
 
 	if (take_lock)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* An object with no levels has nothing to search. */
 	if (dn->dn_phys->dn_nlevels == 0) {
 		error = SET_ERROR(ESRCH);
 		goto out;
 	}
 
 	/*
 	 * A zero dn_datablkshift is handled without walking the tree:
 	 * anything below dn_datablksz is data, and the first hole starts
 	 * at dn_datablksz.
 	 */
 	if (dn->dn_datablkshift == 0) {
 		if (*offset >= dn->dn_datablksz)
 			error = SET_ERROR(ESRCH);
 		else if (want_hole)
 			*offset = dn->dn_datablksz;
 		goto out;
 	}
 
 	maxlvl = dn->dn_phys->dn_nlevels;
 
 	/* Widen the search: climb until a level reports something. */
 	lvl = minlvl;
 	while (lvl <= maxlvl) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 		if (error != ESRCH)
 			break;
 		lvl++;
 	}
 
 	/* Narrow the search: descend back to the requested level. */
 	while (error == 0 && --lvl >= minlvl) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 	}
 
 	/*
 	 * The end of the object always counts as a "virtual hole", even
 	 * when every BP that physically exists is a non-hole.
 	 */
 	if (want_hole && error == ESRCH && txg == 0 &&
 	    minlvl == 1 && blkfill == 1 && !backwards) {
 		error = 0;
 	}
 
 	/* A result on the wrong side of the starting offset is no result. */
 	if (error == 0) {
 		int wrong_side = backwards ?
 		    (start < *offset) : (start > *offset);
 
 		if (wrong_side)
 			error = SET_ERROR(ESRCH);
 	}
 out:
 	if (take_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	return (error);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 296519)
@@ -1,3616 +1,3617 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 RackTop Systems.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dmu_send.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <zfs_fletcher.h>
 
 /* Refer to the _vfs_zfs sysctl node; the tunable below hangs off it. */
 SYSCTL_DECL(_vfs_zfs);
 
 /*
  * The SPA supports block sizes up to 16MB.  However, very large blocks
  * can have an impact on i/o latency (e.g. tying up a spinning disk for
  * ~300ms), and also potentially on the memory allocator.  Therefore,
  * we do not allow the recordsize to be set larger than zfs_max_recordsize
  * (default 1MB).  Larger blocks can be created by changing this tunable,
  * and pools with larger blocks can always be imported and used, regardless
  * of this setting.
  */
 int zfs_max_recordsize = 1 * 1024 * 1024;
 /* Expose zfs_max_recordsize as the integer sysctl "max_recordsize". */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
     &zfs_max_recordsize, 0,
     "Maximum block size.  Expect dragons when tuning this.");
 
 /*
  * Exchange the values of two uint64_t lvalues.  Each argument is
  * evaluated twice, so neither may have side effects.  The expansion is
  * wrapped in do/while (0) so that "SWITCH64(a, b);" is one statement
  * and can be used as the body of an unbraced if/else.
  */
 #define	SWITCH64(x, y) \
 	do { \
 		uint64_t switch64_tmp = (x); \
 		(x) = (y); \
 		(y) = switch64_tmp; \
 	} while (0)
 
 /* 2^62.  NOTE(review): no user of this constant is visible nearby. */
 #define	DS_REF_MAX	(1ULL << 62)
 
 /*
  * extern inline declaration of the dsl_dataset_phys() accessor.
  * NOTE(review): presumably pairs with an inline definition in a header
  * so that this file emits the out-of-line copy -- confirm.
  */
 extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
 /*
  * Work out how much of a change of `delta' bytes in this dataset's
  * unique space should be propagated to the dsl_dir layer.  If there's
  * a refreservation, that space has already been partially accounted
  * for in our ancestors, so only movement above the reservation counts.
  */
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
 	uint64_t reserved = ds->ds_reserved;
 	uint64_t unique;
 	uint64_t old_bytes, new_bytes;
 
 	/* No reservation: the whole delta is passed up unchanged. */
 	if (reserved == 0)
 		return (delta);
 
 	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	/* Charged space is whichever is larger: unique or the reservation. */
 	old_bytes = MAX(unique, reserved);
 	new_bytes = MAX(unique + delta, reserved);
 
 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
 	return (new_bytes - old_bytes);
 }
 
 /*
  * Account for a newly written block `bp' in dataset `ds' (or in the
  * pool's MOS accounting when ds is NULL).  Must be called in syncing
  * context.  Hole block pointers are ignored.
  */
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int dsize = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int psize = BP_GET_PSIZE(bp);
 	int ucsize = BP_GET_UCSIZE(bp);
 	int64_t dir_delta;
 	spa_feature_t cksum_feature;
 
 	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
 		dsl_pool_mos_diduse_space(tx->tx_pool, dsize, psize, ucsize);
 		return;
 	}
 
 	/* Dirty the dataset's buffer before touching its phys counters. */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	/* Must be computed before ds_unique_bytes is bumped below. */
 	dir_delta = parent_delta(ds, dsize);
 	dsl_dataset_phys(ds)->ds_referenced_bytes += dsize;
 	dsl_dataset_phys(ds)->ds_compressed_bytes += psize;
 	dsl_dataset_phys(ds)->ds_uncompressed_bytes += ucsize;
 	dsl_dataset_phys(ds)->ds_unique_bytes += dsize;
 
 	/* Note per-dataset features this block requires to be activated. */
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
 		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
 		    B_TRUE;
 	}
 
 	cksum_feature = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
 	if (cksum_feature != SPA_FEATURE_NONE)
 		ds->ds_feature_activation_needed[cksum_feature] = B_TRUE;
 
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, dir_delta,
 	    psize, ucsize, tx);
 	/* Whatever was not passed to the dir moves from REFRSRV to HEAD. */
 	dsl_dir_transfer_space(ds->ds_dir, dsize - dir_delta,
 	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 }
 
 /*
  * Account for block `bp' no longer being referenced by head dataset
  * `ds' (or by the MOS when ds is NULL).  Must be called in syncing
  * context.
  *
  * A block born after ds_prev_snap_txg is freed outright with dsl_free();
  * otherwise it is placed on the dataset's deadlist (or, when `async' is
  * set, on ds_pending_deadlist for later processing).
  *
  * Returns the block's dsize as computed by bp_get_dsize_sync(), or 0 if
  * bp is a hole.
  */
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(bp->blk_birth <= tx->tx_txg);
 
 	/* No dataset: free the block and credit the MOS accounting. */
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	ASSERT(!ds->ds_is_snapshot);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		/* Born after the previous snapshot txg: free it now. */
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		mutex_enter(&ds->ds_lock);
 		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		/* parent_delta() must see ds_unique_bytes before the update. */
 		delta = parent_delta(ds, -used);
 		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    delta, -compressed, -uncompressed, tx);
 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
 			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
 			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object && bp->blk_birth >
 		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
 		/* Born after the dir's origin txg: move HEAD space to SNAP. */
 		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
 	/* In both cases the head no longer references the block. */
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
 	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
 	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
 	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
 }
 
 /*
  * Return the larger of the dataset's on-disk ds_prev_snap_txg and any
  * snapshot attempt (ds_trysnap_txg) newer than the last synced txg.
  * Returns 0 for a NULL dataset.
  */
 uint64_t
 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 {
 	uint64_t pending;
 
 	if (ds == NULL)
 		return (0);
 	/*
 	 * The snapshot creation could fail, but that would cause an
 	 * incorrect FALSE return, which would only result in an
 	 * overestimation of the amount of space that an operation would
 	 * consume, which is OK.
 	 *
 	 * There's also a small window where we could miss a pending
 	 * snapshot, because we could set the sync task in the quiescing
 	 * phase.  So this should only be used as a guess.
 	 */
 	pending = ds->ds_trysnap_txg;
 	if (pending <= spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		pending = 0;
 	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, pending));
 }
 
 /*
  * Report whether a block with birth txg `blk_birth' would be freeable
  * in `ds'.  When the answer is B_TRUE, ddt_prefetch() is also issued
  * for bp.
  */
 boolean_t
 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth)
 {
 	/* Not freeable if born at or before the previous-snapshot txg. */
 	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
 		return (B_FALSE);
 	/* A hole has nothing to free. */
 	if (bp != NULL && BP_IS_HOLE(bp))
 		return (B_FALSE);
 
 	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 	return (B_TRUE);
 }
 
 static void
 dsl_dataset_evict(void *dbu)
 {
 	dsl_dataset_t *ds = dbu;
 
 	ASSERT(ds->ds_owner == NULL);
 
 	ds->ds_dbuf = NULL;
 
 	unique_remove(ds->ds_fsid_guid);
 
 	if (ds->ds_objset != NULL)
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
 	if (ds->ds_deadlist.dl_os != NULL)
 		dsl_deadlist_close(&ds->ds_deadlist);
 	if (ds->ds_dir)
 		dsl_dir_async_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
 	list_destroy(&ds->ds_prop_cbs);
 	if (mutex_owned(&ds->ds_lock))
 		mutex_exit(&ds->ds_lock);
 	mutex_destroy(&ds->ds_lock);
 	if (mutex_owned(&ds->ds_opening_lock))
 		mutex_exit(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_sendstream_lock);
 	refcount_destroy(&ds->ds_longholds);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 /*
  * Fill in ds->ds_snapname, if it is still empty, by searching the head
  * dataset's snapshot-names ZAP for this dataset's object number.
  * Nothing is done when the name is already set or ds_next_snap_obj is
  * zero.  Returns 0 or the error from the hold / ZAP search.
  */
 int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	dsl_dataset_phys_t *head_phys;
 	dmu_buf_t *head_db;
 	uint64_t head_obj;
 	int error;
 
 	if (ds->ds_snapname[0] != '\0' ||
 	    dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
 		return (0);
 
 	/* The snapshot-name ZAP hangs off the dir's head dataset. */
 	head_obj = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 	error = dmu_bonus_hold(mos, head_obj, FTAG, &head_db);
 	if (error != 0)
 		return (error);
 
 	head_phys = head_db->db_data;
 	error = zap_value_search(mos, head_phys->ds_snapnames_zapobj,
 	    ds->ds_object, 0, ds->ds_snapname);
 	dmu_buf_rele(head_db, FTAG);
 	return (error);
 }
 
 /*
  * Look up snapshot `name' in ds's snapshot-names ZAP and store its
  * object number in *value.  Datasets flagged DS_FLAG_CI_DATASET use a
  * normalized MT_FIRST match, falling back to a plain zap_lookup() when
  * the normalized lookup reports ENOTSUP.
  */
 int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	mt = (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) ?
 	    MT_FIRST : MT_EXACT;
 
 	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 	    value, mt, NULL, 0, NULL);
 	if (err != ENOTSUP || mt != MT_FIRST)
 		return (err);
 
 	return (zap_lookup(mos, snapobj, name, 8, 1, value));
 }
 
 /*
  * Remove snapshot `name' from ds's snapshot-names ZAP, using the same
  * match-type selection and ENOTSUP fallback as the lookup path.  When
  * the removal succeeds and adj_cnt is set, the dir's snapshot count is
  * decremented.  Returns the ZAP error, if any.
  */
 int
 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
     boolean_t adj_cnt)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
 	mt = (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) ?
 	    MT_FIRST : MT_EXACT;
 
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (err == ENOTSUP && mt == MT_FIRST)
 		err = zap_remove(mos, snapobj, name, tx);
 
 	if (err != 0 || !adj_cnt)
 		return (err);
 
 	dsl_fs_ss_count_adjust(ds->ds_dir, -1, DD_FIELD_SNAPSHOT_COUNT, tx);
 	return (err);
 }
 
 /*
  * Try to add a reference (under `tag') to the dataset's bonus dbuf.
  * Returns B_TRUE only if the reference was obtained and ds is still
  * the dbuf's user; otherwise any reference taken is dropped again and
  * B_FALSE is returned.
  */
 boolean_t
 dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_t *db = ds->ds_dbuf;
 
 	if (db == NULL)
 		return (B_FALSE);
 
 	if (!dmu_buf_try_add_ref(db, dp->dp_meta_objset,
 	    ds->ds_object, DMU_BONUS_BLKID, tag))
 		return (B_FALSE);
 
 	if (dmu_buf_get_user(db) == ds)
 		return (B_TRUE);
 
 	/* The dbuf now belongs to some other user; back out. */
 	dmu_buf_rele(db, tag);
 	return (B_FALSE);
 }
 
 /*
  * Hold the dataset stored in MOS object `dsobj' and return its in-core
  * dsl_dataset_t in *dsp.
  *
  * The hold is a hold (under `tag') on the object's bonus dbuf.  If the
  * dbuf has no user attached yet, a dsl_dataset_t is built here and
  * attached with dmu_buf_set_user_ie(); if another thread attached one
  * first, ours is discarded and theirs (`winner') is returned.
  *
  * Returns 0 on success, EINVAL if the object's bonus type is not
  * DMU_OT_DSL_DATASET, or the error from whichever step failed.  On
  * failure the dbuf hold is released and *dsp is not written.
  *
  * The caller must hold the pool config (asserted).
  */
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 	dmu_object_info_t doi;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 
 	/* Make sure dsobj has the correct object type. */
 	dmu_object_info_from_db(dbuf, &doi);
 	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
 		dmu_buf_rele(dbuf, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
 		/* Nothing attached yet: construct the in-core dataset. */
 		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
 		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 		refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
 		dsl_deadlist_open(&ds->ds_deadlist,
 		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 		    offsetof(dmu_sendarg_t, dsa_link));
 
 		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
 		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
 
 		/*
 		 * For a zapified dataset object, record which per-dataset
 		 * features have an entry in the object's ZAP.
 		 */
 		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
 			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 				if (!(spa_feature_table[f].fi_flags &
 				    ZFEATURE_FLAG_PER_DATASET))
 					continue;
 				err = zap_contains(mos, dsobj,
 				    spa_feature_table[f].fi_guid);
 				if (err == 0) {
 					ds->ds_feature_inuse[f] = B_TRUE;
 				} else {
 					ASSERT3U(err, ==, ENOENT);
 					err = 0;
 				}
 			}
 		}
 
 		/* The new dataset keeps its own hold on its dsl_dir. */
 		err = dsl_dir_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		if (err != 0) {
 			/*
 			 * NOTE(review): the two lists created above are not
 			 * list_destroy()ed on this path -- confirm that is
 			 * harmless.
 			 */
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		if (!ds->ds_is_snapshot) {
 			ds->ds_snapname[0] = '\0';
 			/* Hold the previous snapshot, tagged with ds itself. */
 			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 				err = dsl_dataset_hold_obj(dp,
 				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
 			/* A missing bookmark entry (ENOENT) is not an error. */
 			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
 				int zaperr = zap_lookup(mos, ds->ds_object,
 				    DS_FIELD_BOOKMARK_NAMES,
 				    sizeof (ds->ds_bookmarks), 1,
 				    &ds->ds_bookmarks);
 				if (zaperr != ENOENT)
 					VERIFY0(zaperr);
 			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
 			if (err == 0 &&
 			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
 				    dsl_dataset_phys(ds)->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
 		/* Only non-snapshots load refreservation and refquota. */
 		if (err == 0 && !ds->ds_is_snapshot) {
 			err = dsl_prop_get_int_ds(ds,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 			    &ds->ds_reserved);
 			if (err == 0) {
 				err = dsl_prop_get_int_ds(ds,
 				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 				    &ds->ds_quota);
 			}
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
 		/* Try to attach; a non-NULL winner means we lost the race. */
 		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
 		if (err == 0)
 			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
 
 		if (err != 0 || winner != NULL) {
 			/* Discard the dataset we built. */
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
 				dsl_dataset_rele(ds->ds_prev, ds);
 			dsl_dir_rele(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err != 0) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
 			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	*dsp = ds;
 	return (0);
 }
 
 /*
  * Hold a dataset by name.  `name' may be a filesystem/volume name or
  * "fs@snapshot"; in the latter case the snapshot is returned.  On
  * success the dataset is stored in *dsp, held under `tag'.
  *
  * Returns 0, ENOENT if the dir has no head dataset or the trailing
  * component does not start with '@', or the error from a failing
  * hold / lookup.
  */
 int
 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dir;
 	dsl_dataset_t *head, *snap;
 	const char *tail;
 	uint64_t dsobj;
 	int error;
 
 	error = dsl_dir_hold(dp, name, FTAG, &dir, &tail);
 	if (error != 0)
 		return (error);
 
 	ASSERT(dsl_pool_config_held(dp));
 	dsobj = dsl_dir_phys(dir)->dd_head_dataset_obj;
 	if (dsobj == 0)
 		error = SET_ERROR(ENOENT);
 	else
 		error = dsl_dataset_hold_obj(dp, dsobj, tag, &head);
 
 	/* No trailing component, or the head hold failed: we are done. */
 	if (error != 0 || tail == NULL) {
 		if (error == 0)
 			*dsp = head;
 		dsl_dir_rele(dir, FTAG);
 		return (error);
 	}
 
 	/* we may be looking for a snapshot */
 	if (*tail++ != '@') {
 		dsl_dataset_rele(head, tag);
 		dsl_dir_rele(dir, FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	dprintf("looking for snapshot '%s'\n", tail);
 	error = dsl_dataset_snap_lookup(head, tail, &dsobj);
 	if (error == 0)
 		error = dsl_dataset_hold_obj(dp, dsobj, tag, &snap);
 	/* The head was only needed to find the snapshot. */
 	dsl_dataset_rele(head, tag);
 
 	if (error == 0) {
 		/* Cache the name on the snapshot if it has none yet. */
 		mutex_enter(&snap->ds_lock);
 		if (snap->ds_snapname[0] == 0)
 			(void) strlcpy(snap->ds_snapname, tail,
 			    sizeof (snap->ds_snapname));
 		mutex_exit(&snap->ds_lock);
 		*dsp = snap;
 	}
 	dsl_dir_rele(dir, FTAG);
 	return (error);
 }
 
 /*
  * Hold the dataset in MOS object `dsobj' and take ownership of it for
  * `tag'.  Returns 0 on success.  If the dataset cannot be owned, the
  * hold is dropped, *dsp is set to NULL and EBUSY is returned.
  */
 int
 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp)
 {
 	int error;
 
 	error = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 	if (error != 0)
 		return (error);
 
 	if (dsl_dataset_tryown(*dsp, tag))
 		return (0);
 
 	dsl_dataset_rele(*dsp, tag);
 	*dsp = NULL;
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Hold the dataset called `name' and take ownership of it for `tag'.
  * Returns 0 on success, or the error from dsl_dataset_hold().  If the
  * dataset cannot be owned, the hold is dropped, *dsp is set to NULL
  * (matching dsl_dataset_own_obj(), so the caller is not left with a
  * pointer to a released dataset) and EBUSY is returned.
  */
 int
 dsl_dataset_own(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold(dp, name, tag, dsp);
 	if (err != 0)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		*dsp = NULL;
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 /*
  * See the comment above dsl_pool_hold() for details.  In summary, a long
  * hold is used to prevent destruction of a dataset while the pool hold
  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
  *
  * The dataset and pool must be held when this function is called.  After it
  * is called, the pool hold may be released while the dataset is still held
  * and accessed.
  *
  * The long hold is recorded as a reference on ds_longholds under `tag'
  * and is dropped with dsl_dataset_long_rele() using the same tag.
  */
 void
 dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
 {
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	(void) refcount_add(&ds->ds_longholds, tag);
 }
 
 /* Drop the long hold that was taken on ds under `tag'. */
 void
 dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
 {
 	(void) refcount_remove(&ds->ds_longholds, tag);
 }
 
 /* Report whether ds_longholds currently carries at least one hold. */
 boolean_t
 dsl_dataset_long_held(dsl_dataset_t *ds)
 {
 	if (refcount_is_zero(&ds->ds_longholds))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * Write the full name of ds into `name': the dsl_dir name, followed by
  * "@snapname" when the dataset has a snapshot name.  A NULL ds yields
  * "mos".  No length is passed, so the caller's buffer must be large
  * enough for the result.
  */
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	int need_lock;
 
 	if (ds == NULL) {
 		(void) strcpy(name, "mos");
 		return;
 	}
 
 	dsl_dir_name(ds->ds_dir, name);
 	VERIFY0(dsl_dataset_get_snapname(ds));
 	if (ds->ds_snapname[0] == '\0')
 		return;
 
 	(void) strcat(name, "@");
 	/*
 	 * We use a "recursive" mutex so that we
 	 * can call dprintf_ds() with ds_lock held: only take the
 	 * lock here when this thread does not already hold it.
 	 */
 	need_lock = !MUTEX_HELD(&ds->ds_lock);
 	if (need_lock)
 		mutex_enter(&ds->ds_lock);
 	(void) strcat(name, ds->ds_snapname);
 	if (need_lock)
 		mutex_exit(&ds->ds_lock);
 }
 
 /*
  * Release a dataset hold: drops the `tag' hold on the dataset's bonus
  * dbuf (the hold taken in dsl_dataset_hold_obj()).
  */
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 /*
  * Give up ownership of ds.  `tag' must be the current owner (asserted).
  * Clears ds_owner under ds_lock, then drops the long hold and the
  * dataset hold, both under `tag'.
  */
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
 	ASSERT3P(ds->ds_owner, ==, tag);
 	ASSERT(ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
 	mutex_exit(&ds->ds_lock);
 	dsl_dataset_long_rele(ds, tag);
 	dsl_dataset_rele(ds, tag);
 }
 
 /*
  * Try to make `tag' the owner of ds.  Ownership is granted only when
  * the dataset has no owner and is not marked inconsistent; on success
  * a long hold is also taken under `tag'.  Returns B_TRUE if ownership
  * was obtained, B_FALSE otherwise.  The pool config must be held.
  */
 boolean_t
 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
 {
 	boolean_t gotit = B_FALSE;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
 		ds->ds_owner = tag;
 		dsl_dataset_long_hold(ds, tag);
 		gotit = B_TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
 /* Report, under ds_lock, whether ds currently has an owner. */
 boolean_t
 dsl_dataset_has_owner(dsl_dataset_t *ds)
 {
 	boolean_t owned;
 
 	mutex_enter(&ds->ds_lock);
 	owned = (ds->ds_owner != NULL) ? B_TRUE : B_FALSE;
 	mutex_exit(&ds->ds_lock);
 
 	return (owned);
 }
 
 /*
  * Mark per-dataset feature `f' as in use by dataset object `dsobj':
  * bump the feature's count, zapify the object, and add an entry keyed
  * by the feature's guid with a 64-bit value of zero.
  */
 static void
 dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
 {
 	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t entry_value = 0;
 
 	/* Only features flagged per-dataset may be activated this way. */
 	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
 
 	spa_feature_incr(spa, f, tx);
 	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
 
 	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
 	    sizeof (entry_value), 1, &entry_value, tx));
 }
 
 /*
  * Undo dsl_dataset_activate_feature(): remove the feature-guid entry
  * from dataset object `dsobj' and decrement the feature's count.
  */
 void
 dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
 {
 	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	/* Only features flagged per-dataset are tracked on the object. */
 	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
 
 	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
 	spa_feature_decr(spa, f, tx);
 }
 
 /*
  * Create the head dataset object for dsl_dir `dd' in syncing context
  * and return its MOS object number.  dd must not already have a head
  * dataset (asserted).
  *
  * If `origin' is NULL the pool's dp_origin_snap is used instead (which
  * may itself be NULL).  With an origin, the new dataset starts out
  * referencing the origin's block pointer and space counters and is
  * linked to it; without one it gets a fresh, empty deadlist.
  */
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (origin == NULL)
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 	/* Allocate the object and start from a zeroed phys in its bonus. */
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_flags = flags;
 	dsphys->ds_fsid_guid = unique_create();
 	/* Keep drawing until the guid is non-zero. */
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
 	dsphys->ds_snapnames_zapobj =
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 
 	if (origin == NULL) {
 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 	} else {
 		dsl_dataset_t *ohds; /* head of the origin snapshot */
 
 		/* Start out referencing exactly what the origin references. */
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    dsl_dataset_phys(origin)->ds_creation_txg;
 		dsphys->ds_referenced_bytes =
 		    dsl_dataset_phys(origin)->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
 		    dsl_dataset_phys(origin)->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
 		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
 		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
 
 		/*
 		 * Inherit flags that describe the dataset's contents
 		 * (INCONSISTENT) or properties (Case Insensitive).
 		 */
 		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
 		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
 		/* Activate every feature the origin has in use. */
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 			if (origin->ds_feature_inuse[f])
 				dsl_dataset_activate_feature(dsobj, f, tx);
 		}
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_dataset_phys(origin)->ds_num_children++;
 
 		/* Clone the deadlist of the origin's head dataset. */
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
 		    FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 		dsl_dataset_rele(ohds, FTAG);
 
 		/* Record the new dataset in the origin's next-clones ZAP. */
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
 				dsl_dataset_phys(origin)->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
 			    dsl_dataset_phys(origin)->ds_next_clones_obj,
 			    dsobj, tx));
 		}
 
 		/* Point dd at its origin and list it in dd_clones. */
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 				dsl_dir_phys(origin->ds_dir)->dd_clones =
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
 			    dsl_dir_phys(origin->ds_dir)->dd_clones,
 			    dsobj, tx));
 		}
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	dmu_buf_rele(dbuf, FTAG);
 
 	/* Finally make the new object the dir's head dataset. */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
 
 /*
  * Zero the ZIL header kept in the dataset's objset and mark the
  * dataset dirty in this tx.
  */
 static void
 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *objset;
 
 	VERIFY0(dmu_objset_from_ds(ds, &objset));
 	bzero(&objset->os_zil_header, sizeof (objset->os_zil_header));
 	dsl_dataset_dirty(ds, tx);
 }
 
 /*
  * Create a new dsl_dir called `lastname' under `pdd' together with its
  * head dataset (optionally a clone of `origin') and return the new
  * dataset's object number.  Syncing context only.
  * DS_CREATE_FLAG_NODIRTY is consumed here and is not passed on as an
  * on-disk flag.
  */
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	int nodirty = ((flags & DS_CREATE_FLAG_NODIRTY) != 0);
 	uint64_t new_ddobj, new_dsobj;
 	dsl_dir_t *newdd;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
 
 	new_ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, new_ddobj, lastname, FTAG, &newdd));
 
 	new_dsobj = dsl_dataset_create_sync_dd(newdd, origin,
 	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
 	dsl_deleg_set_create_perms(newdd, tx, cr);
 
 	/*
 	 * Since we're creating a new node we know it's a leaf, so we can
 	 * initialize the counts if the limit feature is active.
 	 */
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		objset_t *os = newdd->dd_pool->dp_meta_objset;
 		uint64_t cnt = 0;
 
 		dsl_dir_zapify(newdd, tx);
 		VERIFY0(zap_add(os, newdd->dd_object,
 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (cnt), 1, &cnt, tx));
 		VERIFY0(zap_add(os, newdd->dd_object,
 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (cnt), 1, &cnt, tx));
 	}
 
 	dsl_dir_rele(newdd, FTAG);
 
 	/*
 	 * If we are creating a clone, make sure we zero out any stale
 	 * data from the origin snapshots zil header.
 	 */
 	if (origin != NULL && !nodirty) {
 		dsl_dataset_t *newds;
 
 		VERIFY0(dsl_dataset_hold_obj(dp, new_dsobj, FTAG, &newds));
 		dsl_dataset_zero_zil(newds, tx);
 		dsl_dataset_rele(newds, FTAG);
 	}
 
 	return (new_dsobj);
 }
 
 #ifdef __FreeBSD__
 /* FreeBSD ioctl compat begin */
 /* Callback argument shared with dsl_check_snap_cb(). */
 struct destroyarg {
 	nvlist_t *nvl;		/* receives one "fs@snapname" entry per fs */
 	const char *snapname;	/* snapshot component placed after the '@' */
 };
 
 /*
  * dmu_objset_find() callback: build "<name>@<snapname>" from the
  * visited dataset name and the snapshot name carried in `arg' (a
  * struct destroyarg), and add it as a boolean entry to the nvlist.
  * Always returns 0.
  */
 static int
 dsl_check_snap_cb(const char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	char *dsname;
 
 	dsname = kmem_asprintf("%s@%s", name, da->snapname);
 	fnvlist_add_boolean(da->nvl, dsname);
 	/* Free the formatted string: its length plus the terminating NUL. */
 	kmem_free(dsname, strlen(dsname) + 1);
 
 	return (0);
 }
 
 /*
  * Walk `fsname' and its children (DS_FIND_CHILDREN) and add
  * "<dataset>@<snapname>" to `snaps' for each dataset visited.
  * Returns the result of dmu_objset_find().
  */
 int
 dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
     nvlist_t *snaps)
 {
 	struct destroyarg *cbarg;
 	int error;
 
 	cbarg = kmem_zalloc(sizeof (*cbarg), KM_SLEEP);
 	cbarg->snapname = snapname;
 	cbarg->nvl = snaps;
 
 	error = dmu_objset_find(fsname, dsl_check_snap_cb, cbarg,
 	    DS_FIND_CHILDREN);
 
 	kmem_free(cbarg, sizeof (*cbarg));
 	return (error);
 }
 /* FreeBSD ioctl compat end */
 #endif /* __FreeBSD__ */
 
 /*
  * Recompute ds_unique_bytes for a head dataset.
  *
  * Unique space is what the head references minus whatever part of the
  * most recent snapshot is still in use by the head.  That still-shared
  * part is the snapshot's referenced bytes less the space that has been
  * freed since the snapshot was taken, i.e. the space on the head's
  * deadlist.
  */
 void
 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
 	uint64_t mrs_used = 0;
 	uint64_t dlused, dlcomp, dluncomp;
 
 	ASSERT(!ds->ds_is_snapshot);
 
 	/* Without a previous snapshot nothing is shared. */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
 		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
 
 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
 	ASSERT3U(dlused, <=, mrs_used);
 	dsl_dataset_phys(ds)->ds_unique_bytes =
 	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
 
 	/* The value just computed is exact; flag it where supported. */
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE) {
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 	}
 }
 
 /*
  * Remove object number `obj' from ds's next-clones ZAP.  A missing
  * entry (ENOENT) is tolerated, see below; any other error is fatal.
  */
 void
 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t count;
 	int err;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
 	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    obj, tx);
 	/*
 	 * The err should not be ENOENT, but a bug in a previous version
 	 * of the code could cause upgrade_clones_cb() to not set
 	 * ds_next_snap_obj when it should, leading to a missing entry.
 	 * If we knew that the pool was created after
 	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
 	 * ENOENT.  However, at least we can check that we don't have
 	 * too many entries in the next_clones_obj even after failing to
 	 * remove this one.
 	 */
 	if (err != ENOENT)
 		VERIFY0(err);
 	/* `count' is written and read only inside the two assertions. */
 	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    &count));
 	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
 
 /* Return a pointer to the ds_bp block pointer inside the dataset's phys. */
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
 	return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
 /*
  * Store *bp as the dataset's block pointer (ds_bp), dirtying the
  * dataset's buffer first.  A NULL ds means the meta-objset, whose
  * block pointer lives in the pool as dp_meta_rootbp.  Syncing context
  * only.
  */
 void
 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* If it's the meta-objset, set dp_meta_rootbp */
 	if (ds == NULL) {
 		tx->tx_pool->dp_meta_rootbp = *bp;
 		return;
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_bp = *bp;
 }
 
 /* Return the spa of the pool that ds's dsl_dir belongs to; ds must be non-NULL. */
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	return (ds->ds_dir->dd_pool->dp_spa);
 }
 
 /*
  * Put ds on the pool's dirty-datasets list for tx's txg.  The first
  * time it is added for that txg an extra reference is taken on the
  * dataset's dbuf.  A NULL ds (the meta-objset) is ignored; a dataset
  * with a non-zero ds_next_snap_obj causes a panic.
  */
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp;
 
 	if (ds == NULL) /* this is the meta-objset */
 		return;
 
 	ASSERT(ds->ds_objset != NULL);
 
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	dp = ds->ds_dir->dd_pool;
 
 	/* Already on the list for this txg: nothing more to do. */
 	if (!txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg))
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(ds->ds_dbuf, ds);
 }
 
 /*
  * Report whether ds is on the pool's dirty-datasets list in any of
  * the TXG_SIZE txg slots.
  */
 boolean_t
 dsl_dataset_is_dirty(dsl_dataset_t *ds)
 {
 	int slot = 0;
 
 	while (slot < TXG_SIZE) {
 		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
 		    ds, slot))
 			return (B_TRUE);
 		slot++;
 	}
 	return (B_FALSE);
 }
 
 /*
  * In syncing context, check that the dir has room for the part of the
  * dataset's unique space that is covered by its reservation, and if so
  * announce that usage.  Outside syncing context this does nothing.
  * Returns 0 or ENOSPC.
  */
 static int
 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t needed;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * If there's an fs-only reservation, any blocks that might become
 	 * owned by the snapshot dataset must be accommodated by space
 	 * outside of the reservation.
 	 */
 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
 	needed = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
 	if (needed > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/*
 	 * Propagate any reserved space for this snapshot to other
 	 * snapshot checks in this sync group.
 	 */
 	if (needed != 0)
 		dsl_dir_willuse_space(ds->ds_dir, needed, tx);
 
 	return (0);
 }
 
 /*
  * Argument bundle handed to dsl_dataset_snapshot_check() and
  * dsl_dataset_snapshot_sync() through dsl_sync_task().
  */
 typedef struct dsl_dataset_snapshot_arg {
 	/* one pair per snapshot to create; pair names are "<dataset>@<snap>" */
 	nvlist_t *ddsa_snaps;
 	/* if non-NULL, properties set on each newly created snapshot */
 	nvlist_t *ddsa_props;
 	/* if non-NULL, receives an int32 error code keyed by the failing name */
 	nvlist_t *ddsa_errors;
 	/* credentials passed to the snapshot-limit check */
 	cred_t *ddsa_cr;
 } dsl_dataset_snapshot_arg_t;
 
 /*
  * Check whether a snapshot named snapname can be taken of ds in tx.
  *
  * ds->ds_trysnap_txg is always updated.  The remaining checks only run in
  * syncing context; otherwise 0 is returned.
  *
  * recv: B_TRUE when the snapshot is created as part of a receive, in which
  *       case an inconsistent dataset is tolerated.
  * cnt/cr: when cnt is non-zero and cr is non-NULL, the snapshot limit is
  *       checked for cnt new snapshots using credentials cr.
  *
  * Returns 0 on success, EAGAIN if a snapshot already exists for this txg,
  * EEXIST if the name is taken, EBUSY for an inconsistent dataset, or the
  * error of the lookup, limit check or space reservation.
  */
 int
 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
 {
 	uint64_t snapobj;
 	int err;
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * Only one snapshot per txg is allowed.  If this txg already has
 	 * one, the caller should try again.
 	 */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
 		return (SET_ERROR(EAGAIN));
 
 	/*
 	 * The snapshot name must not be in use yet: the lookup is expected
 	 * to fail with ENOENT.
 	 */
 	err = dsl_dataset_snap_lookup(ds, snapname, &snapobj);
 	switch (err) {
 	case 0:
 		return (SET_ERROR(EEXIST));
 	case ENOENT:
 		break;
 	default:
 		return (err);
 	}
 
 	/*
 	 * Inconsistent datasets (e.g. the target of an in-progress receive)
 	 * may not be snapshotted.  The exception is the snapshot taken as
 	 * part of the receive itself: this check then runs atomically with
 	 * the completion of the receive but before DS_FLAG_INCONSISTENT is
 	 * cleared, which dmu_recv_end_sync() takes care of shortly after.
 	 */
 	if (!recv && DS_IS_INCONSISTENT(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * The limit check is skipped for temporary snapshots and when
 	 * dsl_dataset_snapshot_check has already validated the counts, so
 	 * in practice it only happens here when receiving a stream.
 	 */
 	if (cnt != 0 && cr != NULL) {
 		err = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
 		if (err != 0)
 			return (err);
 	}
 
 	return (dsl_dataset_snapshot_reserve_space(ds, tx));
 }
 
 /*
  * Check callback of the snapshot sync task (see dsl_dataset_snapshot()).
  *
  * In syncing context it first validates the aggregated number of new
  * snapshots against the snapshot limit of every dataset on the path of each
  * requested snapshot.  It then validates every requested snapshot
  * individually.  Errors are recorded in ddsa_errors (when non-NULL) keyed by
  * name; the return value is the last error encountered, or 0.
  */
 static int
 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 	int rv = 0;
 
 	/*
 	 * Pre-compute how many total new snapshots will be created for each
 	 * level in the tree and below. This is needed for validating the
 	 * snapshot limit when either taking a recursive snapshot or when
 	 * taking multiple snapshots.
 	 *
 	 * The problem is that the counts are not actually adjusted when
 	 * we are checking, only when we finally sync. For a single snapshot,
 	 * this is easy, the count will increase by 1 at each node up the tree,
 	 * but it's more complicated for the recursive/multiple snapshot case.
 	 *
 	 * The dsl_fs_ss_limit_check function does recursively check the count
 	 * at each level up the tree but since it is validating each snapshot
 	 * independently we need to be sure that we are validating the complete
 	 * count for the entire set of snapshots. We do this by rolling up the
 	 * counts for each component of the name into an nvlist and then
 	 * checking each of those cases with the aggregated count.
 	 *
 	 * This approach properly handles not only the recursive snapshot
 	 * case (where we get all of those on the ddsa_snaps list) but also
 	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
 	 * validate the limit on 'a' using a count of 2).
 	 *
 	 * We validate the snapshot names in the third loop and only report
 	 * name errors once.
 	 */
 	if (dmu_tx_is_syncing(tx)) {
 		nvlist_t *cnt_track = NULL;
 		cnt_track = fnvlist_alloc();
 
 		/* Rollup aggregated counts into the cnt_track list */
 		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 		    pair != NULL;
 		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 			char *pdelim;
 			uint64_t val;
 			char nm[MAXPATHLEN];
 
 			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
 			pdelim = strchr(nm, '@');
 			/* names without '@' are skipped here; loop 3 reports them */
 			if (pdelim == NULL)
 				continue;
 			*pdelim = '\0';
 
 			/*
 			 * Count this snapshot against the dataset itself and
 			 * against every ancestor, by repeatedly chopping the
 			 * last '/'-separated component off the name.
 			 */
 			do {
 				if (nvlist_lookup_uint64(cnt_track, nm,
 				    &val) == 0) {
 					/* update existing entry */
 					fnvlist_add_uint64(cnt_track, nm,
 					    val + 1);
 				} else {
 					/* add to list */
 					fnvlist_add_uint64(cnt_track, nm, 1);
 				}
 
 				pdelim = strrchr(nm, '/');
 				if (pdelim != NULL)
 					*pdelim = '\0';
 			} while (pdelim != NULL);
 		}
 
 		/* Check aggregated counts at each level */
 		for (pair = nvlist_next_nvpair(cnt_track, NULL);
 		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
 			int error = 0;
 			char *name;
 			uint64_t cnt = 0;
 			dsl_dataset_t *ds;
 
 			name = nvpair_name(pair);
 			cnt = fnvpair_value_uint64(pair);
 			ASSERT(cnt > 0);
 
 			error = dsl_dataset_hold(dp, name, FTAG, &ds);
 			if (error == 0) {
 				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 				    ddsa->ddsa_cr);
 				dsl_dataset_rele(ds, FTAG);
 			}
 
 			if (error != 0) {
 				if (ddsa->ddsa_errors != NULL)
 					fnvlist_add_int32(ddsa->ddsa_errors,
 					    name, error);
 				rv = error;
 				/* only report one error for this check */
 				break;
 			}
 		}
 		nvlist_free(cnt_track);
 	}
 
 	/*
 	 * Third loop: validate each requested snapshot on its own (name
 	 * length, presence of '@', dataset lookup, per-dataset checks).
 	 * Unlike the limit check above, every failing name is reported.
 	 */
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		int error = 0;
 		dsl_dataset_t *ds;
 		char *name, *atp;
 		char dsname[MAXNAMELEN];
 
 		name = nvpair_name(pair);
 		if (strlen(name) >= MAXNAMELEN)
 			error = SET_ERROR(ENAMETOOLONG);
 		if (error == 0) {
 			atp = strchr(name, '@');
 			if (atp == NULL)
 				error = SET_ERROR(EINVAL);
 			/* copy only the part before '@' (the dataset name) */
 			if (error == 0)
 				(void) strlcpy(dsname, name, atp - name + 1);
 		}
 		if (error == 0)
 			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 		if (error == 0) {
 			/* passing 0/NULL skips dsl_fs_ss_limit_check */
 			error = dsl_dataset_snapshot_check_impl(ds,
 			    atp + 1, tx, B_FALSE, 0, NULL);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (error != 0) {
 			if (ddsa->ddsa_errors != NULL) {
 				fnvlist_add_int32(ddsa->ddsa_errors,
 				    name, error);
 			}
 			rv = error;
 		}
 	}
 
 	return (rv);
 }
 
 /*
  * Create the snapshot "snapname" of ds as part of tx.
  *
  * A new dataset object is allocated in the MOS and filled in from the
  * head's current phys state, so the snapshot takes over the head's deadlist
  * object, space accounting, flags and block pointer.  The new object is
  * linked between the head and its previous snapshot, the head gets a fresh
  * deadlist and has its unique bytes reset, and the name is added to the
  * head's snapshot-names ZAP.  On return ds->ds_prev refers to the new
  * snapshot.
  *
  * Asserts that the pool config lock is held as writer.
  */
 void
 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
 	static zil_header_t zero_zil;
 
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
 	objset_t *os;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/*
 	 * If we are on an old pool, the zil must not be active, in which
 	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
 	 */
 	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
 	    dmu_objset_from_ds(ds, &os) != 0 ||
 	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
 	    sizeof (zero_zil)) == 0);
 
 	/* account for one more snapshot in this dsl_dir */
 	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
 	 */
 	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
 		crtxg = 1;
 	else
 		crtxg = tx->tx_txg;
 
 	/* allocate the snapshot's dataset object and fill in its phys */
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	/* a guid of 0 is not accepted; keep drawing until it is non-zero */
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
 	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
 	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
 	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes =
 	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
 	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
 	dmu_buf_rele(dbuf, FTAG);
 
 	/* features in use by the head are activated on the snapshot too */
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (ds->ds_feature_inuse[f])
 			dsl_dataset_activate_feature(dsobj, f, tx);
 	}
 
 	/*
 	 * Re-link the previous snapshot: either its next-snapshot pointer
 	 * referred to the head and must now refer to the new snapshot, or
 	 * the head is recorded in its next-clones ZAP and that entry is
 	 * replaced by the new snapshot's object.
 	 */
 	ASSERT3U(ds->ds_prev != 0, ==,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
 		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
 			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
 			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
 			VERIFY0(zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
 
 	/*
 	 * If we have a reference-reservation on this dataset, we will
 	 * need to increase the amount of refreservation being charged
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
 		int64_t delta;
 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
 		    ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 		    delta, 0, 0, tx);
 	}
 
 	/*
 	 * The old deadlist object now belongs to the snapshot (see
 	 * dsphys->ds_deadlist_obj above); give the head a new one cloned
 	 * from it and reopen the in-core deadlist on the new object.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj =
 	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_open(&ds->ds_deadlist, mos,
 	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 
 	/* point the head at the new snapshot and reset its unique space */
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
 	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
 	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
 	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	/* register the snapshot name -> object mapping */
 	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx));
 
 	/* swap the in-core ds_prev hold over to the new snapshot */
 	if (ds->ds_prev)
 		dsl_dataset_rele(ds->ds_prev, ds);
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
 	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
 }
 
 /*
  * Sync callback of the snapshot sync task: create every snapshot listed in
  * ddsa_snaps and, when ddsa_props is non-NULL, set those properties on each
  * new snapshot.  The names were validated by the check callback, so each
  * one is assumed to contain a '@'.
  */
 static void
 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *elem = NULL;
 
 	while ((elem = nvlist_next_nvpair(ddsa->ddsa_snaps, elem)) != NULL) {
 		char fsname[MAXNAMELEN];
 		char *fullname = nvpair_name(elem);
 		char *at = strchr(fullname, '@');
 		dsl_dataset_t *ds;
 
 		/* everything before the '@' names the dataset */
 		(void) strlcpy(fsname, fullname, at - fullname + 1);
 		VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
 
 		dsl_dataset_snapshot_sync_impl(ds, at + 1, tx);
 
 		/* ds->ds_prev is the snapshot that was just created */
 		if (ddsa->ddsa_props != NULL) {
 			dsl_props_set_sync_impl(ds->ds_prev,
 			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
 		}
 
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * Take all the snapshots named in 'snaps' (pair names of the form
  * "<dataset>@<snapshot>") in a single sync task, optionally setting 'props'
  * on each of them.  Per-snapshot errors are reported through 'errors' when
  * it is non-NULL.
  *
  * The snapshots must all be in the same pool.
  * All-or-nothing: if there are any failures, nothing will be modified.
  *
  * On pools older than SPA_VERSION_FAST_SNAP the ZIL of every affected
  * dataset is suspended around the sync task.
  */
 int
 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 {
 	dsl_dataset_snapshot_arg_t ddsa;
 	nvlist_t *suspended = NULL;
 	nvpair_t *pair;
 	boolean_t needsuspend;
 	char *firstname;
 	spa_t *spa;
 	int error;
 
 	/* an empty request trivially succeeds */
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (0);
 	firstname = nvpair_name(pair);
 
 	/* the pool version decides whether the ZILs must be suspended */
 	error = spa_open(firstname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		suspended = fnvlist_alloc();
 		pair = NULL;
 		while ((pair = nvlist_next_nvpair(snaps, pair)) != NULL) {
 			char fsname[MAXNAMELEN];
 			char *snapname = nvpair_name(pair);
 			char *atp = strchr(snapname, '@');
 			void *cookie;
 
 			if (atp == NULL) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			(void) strlcpy(fsname, snapname, atp - snapname + 1);
 
 			error = zil_suspend(fsname, &cookie);
 			if (error != 0)
 				break;
 
 			/* remember the cookie so the ZIL can be resumed */
 			fnvlist_add_uint64(suspended, fsname,
 			    (uintptr_t)cookie);
 		}
 	}
 
 	ddsa.ddsa_snaps = snaps;
 	ddsa.ddsa_props = props;
 	ddsa.ddsa_errors = errors;
 	ddsa.ddsa_cr = CRED();
 
 	if (error == 0) {
 		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
 		    dsl_dataset_snapshot_sync, &ddsa,
 		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
 	}
 
 	/* resume whatever was suspended, regardless of the outcome */
 	if (suspended != NULL) {
 		pair = NULL;
 		while ((pair = nvlist_next_nvpair(suspended, pair)) != NULL) {
 			zil_resume((void *)(uintptr_t)
 			    fnvpair_value_uint64(pair));
 		}
 		fnvlist_free(suspended);
 	}
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	if (error == 0) {
 		pair = NULL;
 		while ((pair = nvlist_next_nvpair(snaps, pair)) != NULL) {
 			char *snapname = nvpair_name(pair);
 			zvol_create_minors(snapname);
 		}
 	}
 #endif
 #endif
 	return (error);
 }
 
 /*
  * Argument bundle handed to dsl_dataset_snapshot_tmp_check() and
  * dsl_dataset_snapshot_tmp_sync() through dsl_sync_task().
  */
 typedef struct dsl_dataset_snapshot_tmp_arg {
 	/* name of the dataset to snapshot */
 	const char *ddsta_fsname;
 	/* name of the snapshot to create (the part after '@') */
 	const char *ddsta_snapname;
 	/* passed through to dsl_dataset_user_hold_sync_one() */
 	minor_t ddsta_cleanup_minor;
 	/* tag of the user hold placed on the new snapshot */
 	const char *ddsta_htag;
 } dsl_dataset_snapshot_tmp_arg_t;
 
 /*
  * Check callback for a temporary snapshot.  Runs the regular per-snapshot
  * checks (without a snapshot-limit check), requires a pool version that
  * supports user holds, and validates the hold tag.  Returns 0 or an errno;
  * ENOTSUP if the pool is older than SPA_VERSION_USERREFS.
  */
 static int
 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* NULL cred means no limit check for tmp snapshot */
 	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
 	    tx, B_FALSE, 0, NULL);
 
 	if (error == 0 && spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
 		error = SET_ERROR(ENOTSUP);
 
 	if (error == 0) {
 		error = dsl_dataset_user_hold_check_one(NULL,
 		    ddsta->ddsta_htag, B_TRUE, tx);
 	}
 
 	/* single exit: the hold is dropped on every path */
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Sync callback for a temporary snapshot: create the snapshot, place a user
  * hold with tag ddsta_htag on it, and then run the snapshot destroy on it
  * with B_TRUE as the second argument.
  */
 static void
 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), ddsta->ddsta_fsname,
 	    FTAG, &ds));
 
 	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
 
 	/* from here on ds->ds_prev is the snapshot that was just created */
 	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
 	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
 	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Create a temporary snapshot fsname@snapname that carries a user hold
  * tagged htag; cleanup_minor is handed to the hold code.  On pools older
  * than SPA_VERSION_FAST_SNAP the dataset's ZIL is suspended around the
  * sync task.  Returns 0 or an errno.
  */
 int
 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
     minor_t cleanup_minor, const char *htag)
 {
 	dsl_dataset_snapshot_tmp_arg_t ddsta = {
 		.ddsta_fsname = fsname,
 		.ddsta_snapname = snapname,
 		.ddsta_cleanup_minor = cleanup_minor,
 		.ddsta_htag = htag
 	};
 	boolean_t needsuspend;
 	void *cookie;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(fsname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		error = zil_suspend(fsname, &cookie);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
 	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
 
 	/* cookie is only valid (and only used) when we suspended above */
 	if (needsuspend)
 		zil_resume(cookie);
 
 	return (error);
 }
 
 
 /*
  * Write out the dataset in syncing context: persist ds_fsid_guid, flush any
  * resume-state recorded for this txg to the dataset's ZAP entries, sync the
  * objset, and activate features that were requested but are not in use yet.
  * ds must not have a next snapshot (asserted).
  */
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	int slot = tx->tx_txg & TXG_MASK;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
 	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
 
 	/*
 	 * ds_fsid_guid may have been changed when the dataset was opened;
 	 * make sure the current value reaches disk.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
 	/* a non-zero byte count means resume state is pending for this txg */
 	if (ds->ds_resume_bytes[slot] != 0) {
 		objset_t *mos = tx->tx_pool->dp_meta_objset;
 
 		VERIFY0(zap_update(mos, ds->ds_object,
 		    DS_FIELD_RESUME_OBJECT, 8, 1,
 		    &ds->ds_resume_object[slot], tx));
 		VERIFY0(zap_update(mos, ds->ds_object,
 		    DS_FIELD_RESUME_OFFSET, 8, 1,
 		    &ds->ds_resume_offset[slot], tx));
 		VERIFY0(zap_update(mos, ds->ds_object,
 		    DS_FIELD_RESUME_BYTES, 8, 1,
 		    &ds->ds_resume_bytes[slot], tx));
 
 		/* the slot has been written out; clear it */
 		ds->ds_resume_object[slot] = 0;
 		ds->ds_resume_offset[slot] = 0;
 		ds->ds_resume_bytes[slot] = 0;
 	}
 
 	dmu_objset_sync(ds->ds_objset, zio, tx);
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (ds->ds_feature_activation_needed[f] &&
 		    !ds->ds_feature_inuse[f]) {
 			dsl_dataset_activate_feature(ds->ds_object, f, tx);
 			ds->ds_feature_inuse[f] = B_TRUE;
 		}
 	}
 }
 
 /*
  * Add the "clones" property of ds to nv: an nvlist holding one boolean
  * entry per dataset listed in ds_next_clones_obj, named after that
  * dataset's dsl_dir.  Nothing is added when the number of entries does not
  * match ds_num_children - 1.
  */
 static void
 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t clones_obj = dsl_dataset_phys(ds)->ds_next_clones_obj;
 	uint64_t nentries = 0;
 	nvlist_t *propval = fnvlist_alloc();
 	nvlist_t *val = fnvlist_alloc();
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	/*
 	 * A bug in a previous version of the code could leave entries
 	 * missing from ds_next_clones_obj, so the list is only trusted when
 	 * it has the expected number of entries.
 	 */
 	if (clones_obj != 0) {
 		VERIFY0(zap_count(mos, clones_obj, &nentries));
 	}
 
 	if (nentries == dsl_dataset_phys(ds)->ds_num_children - 1) {
 		zap_cursor_t zc;
 		zap_attribute_t za;
 
 		zap_cursor_init(&zc, mos, clones_obj);
 		while (zap_cursor_retrieve(&zc, &za) == 0) {
 			dsl_dataset_t *clone;
 			char buf[ZFS_MAXNAMELEN];
 
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    za.za_first_integer, FTAG, &clone));
 			dsl_dir_name(clone->ds_dir, buf);
 			fnvlist_add_boolean(val, buf);
 			dsl_dataset_rele(clone, FTAG);
 
 			zap_cursor_advance(&zc);
 		}
 		zap_cursor_fini(&zc);
 
 		fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
 		fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
 		    propval);
 	}
 
 	/* the local lists are released on both paths */
 	nvlist_free(val);
 	nvlist_free(propval);
 }
 
 /*
  * If ds carries resumable-receive state, build the receive resume token and
  * add it to nv as ZFS_PROP_RECEIVE_RESUME_TOKEN.
  *
  * The state found in the dataset's ZAP entries is collected into an nvlist,
  * which is packed, gzip-compressed and hex-encoded.  The resulting property
  * value has the form
  *   <token version>-<checksum word 0>-<packed size>-<hex string>
  * with the two middle fields printed in hexadecimal.
  */
 static void
 get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	if (dsl_dataset_has_resume_receive_state(ds)) {
 		char *str;
 		void *packed;
 		uint8_t *compressed;
 		uint64_t val;
 		nvlist_t *token_nv = fnvlist_alloc();
 		size_t packed_size, compressed_size;
 
 		/* each field is optional: it is added only if the lookup works */
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
 			fnvlist_add_uint64(token_nv, "fromguid", val);
 		}
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
 			fnvlist_add_uint64(token_nv, "object", val);
 		}
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
 			fnvlist_add_uint64(token_nv, "offset", val);
 		}
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
 			fnvlist_add_uint64(token_nv, "bytes", val);
 		}
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
 			fnvlist_add_uint64(token_nv, "toguid", val);
 		}
 		char buf[256];
 		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
 			fnvlist_add_string(token_nv, "toname", buf);
 		}
 		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_EMBEDOK) == 0) {
 			fnvlist_add_boolean(token_nv, "embedok");
 		}
 		packed = fnvlist_pack(token_nv, &packed_size);
 		fnvlist_free(token_nv);
 		/*
 		 * The compression buffer is sized like the packed data.
 		 * NOTE(review): confirm what gzip_compress() returns/writes
 		 * when the input does not shrink.
 		 */
 		compressed = kmem_alloc(packed_size, KM_SLEEP);
 
 		compressed_size = gzip_compress(packed, compressed,
 		    packed_size, packed_size, 6);
 
 		/* the checksum covers the compressed bytes */
 		zio_cksum_t cksum;
 		fletcher_4_native(compressed, compressed_size, NULL, &cksum);
 
 		/* two hex digits per compressed byte plus the terminator */
 		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
 		for (int i = 0; i < compressed_size; i++) {
 			(void) sprintf(str + i * 2, "%02x", compressed[i]);
 		}
 		str[compressed_size * 2] = '\0';
 		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
 		    ZFS_SEND_RESUME_TOKEN_VERSION,
 		    (longlong_t)cksum.zc_word[0],
 		    (longlong_t)packed_size, str);
 		dsl_prop_nvlist_add_string(nv,
 		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
 		/* each buffer is freed with the size it was allocated with */
 		kmem_free(packed, packed_size);
 		kmem_free(str, compressed_size * 2 + 1);
 		kmem_free(compressed, packed_size);
 		strfree(propval);
 	}
 }
 
 /*
  * Add the dataset-level statistics/properties of ds to nv.
  *
  * This covers the compression ratio and logical size, the
  * snapshot-specific (used, clones) or head-specific (previous snapshot,
  * dsl_dir stats) values, space accounting, creation information, quota and
  * reservation, guid, object id, user references, the defer-destroy flag
  * and, when there is a previous snapshot, the "written" amount.  For
  * non-snapshots it also adds the receive resume token of the dataset
  * itself and of its "%recv" child, if either carries resume state.
  *
  * The caller must hold the pool config lock (asserted).
  */
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	uint64_t refd, avail, uobjs, aobjs, ratio;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	/* ratio is a percentage; 100 when nothing has been compressed */
 	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
 	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
 	    dsl_dataset_phys(ds)->ds_compressed_bytes);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
 	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
 
 	if (ds->ds_is_snapshot) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 		    dsl_dataset_phys(ds)->ds_unique_bytes);
 		get_clones_stat(ds, nv);
 	} else {
 		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
 			char buf[MAXNAMELEN];
 			dsl_dataset_name(ds->ds_prev, buf);
 			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
 		}
 
 		dsl_dir_stats(ds->ds_dir, nv);
 	}
 
 	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
 	    dsl_dataset_phys(ds)->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
 	    dsl_dataset_phys(ds)->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
 	    ds->ds_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
 	    ds->ds_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
 	    dsl_dataset_phys(ds)->ds_guid);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
 	    dsl_dataset_phys(ds)->ds_unique_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
 	    ds->ds_object);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
 	    ds->ds_userrefs);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
 	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
 	/* "written" is only reported when it can actually be computed */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		uint64_t written, comp, uncomp;
 		dsl_dataset_t *prev;
 
 		int err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err == 0) {
 			err = dsl_dataset_space_written(prev, ds, &written,
 			    &comp, &uncomp);
 			dsl_dataset_rele(prev, FTAG);
 			if (err == 0) {
 				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
 				    written);
 			}
 		}
 	}
 
 	if (!dsl_dataset_is_snapshot(ds)) {
 		/*
 		 * A failed "newfs" (e.g. full) resumable receive leaves
 		 * the stats set on this dataset.  Check here for the prop.
 		 */
 		get_receive_resume_stats(ds, nv);
 
 		/*
 		 * A failed incremental resumable receive leaves the
 		 * stats set on our child named "%recv".  Check the child
 		 * for the prop.
 		 *
 		 * The child's name is built with bounded appends: if
 		 * "<name>/<recv_clone_name>" does not fit in recvname, a
 		 * child with that name is not looked up at all, rather than
 		 * writing past the end of the buffer.
 		 */
 		char recvname[ZFS_MAXNAMELEN];
 		dsl_dataset_t *recv_ds;
 		dsl_dataset_name(ds, recvname);
 		if (strlcat(recvname, "/", sizeof (recvname)) <
 		    sizeof (recvname) &&
 		    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
 		    sizeof (recvname) &&
 		    dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
 			get_receive_resume_stats(recv_ds, nv);
 			dsl_dataset_rele(recv_ds, FTAG);
 		}
 	}
 }
 
 /*
  * Fill in *stat with the values available without extra I/O beyond the
  * origin lookup: creation txg, inconsistent flag, guid, snapshot flag,
  * number of clones (snapshots only) and the origin name (clones only,
  * otherwise the empty string).  The pool config lock must be held.
  */
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
 	stat->dds_inconsistent =
 	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
 	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
 	stat->dds_origin[0] = '\0';
 
 	if (!ds->ds_is_snapshot) {
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
 
 		/* only a clone has an origin to report */
 		if (dsl_dir_is_clone(ds->ds_dir)) {
 			dsl_dataset_t *origin;
 
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
 			    FTAG, &origin));
 			dsl_dataset_name(origin, stat->dds_origin);
 			dsl_dataset_rele(origin, FTAG);
 		}
 	} else {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones =
 		    dsl_dataset_phys(ds)->ds_num_children - 1;
 	}
 }
 
 /*
  * Return the in-core fsid guid of the dataset.
  */
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	uint64_t fsid_guid = ds->ds_fsid_guid;
 
 	return (fsid_guid);
 }
 
 /*
  * Report the space and object usage of ds:
  *   *refdbytesp  - referenced bytes
  *   *availbytesp - bytes available, including the unused part of the
  *                  reservation and capped by the refquota
  *   *usedobjsp   - fill count of the dataset's block pointer
  *   *availobjsp  - DN_MAX_OBJECT minus the used objects
  */
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_phys_t *phys = dsl_dataset_phys(ds);
 	uint64_t refd = phys->ds_referenced_bytes;
 	uint64_t avail;
 
 	*refdbytesp = refd;
 
 	avail = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 
 	/* the not yet consumed part of the reservation is available too */
 	if (ds->ds_reserved > phys->ds_unique_bytes)
 		avail += ds->ds_reserved - phys->ds_unique_bytes;
 
 	/* Adjust available bytes according to refquota */
 	if (ds->ds_quota != 0) {
 		if (refd >= ds->ds_quota)
 			avail = 0;
 		else
 			avail = MIN(avail, ds->ds_quota - refd);
 	}
 	*availbytesp = avail;
 
 	*usedobjsp = BP_GET_FILL(&phys->ds_bp);
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 /*
  * Return B_TRUE if ds has been modified since snapshot snap was created.
  * A NULL snap yields B_FALSE.  If either objset cannot be obtained the
  * dataset is reported as modified.  The pool config lock must be held.
  */
 boolean_t
 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *os, *os_snap;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	if (snap == NULL)
 		return (B_FALSE);
 
 	/* block pointer not born after the snapshot: nothing was written */
 	if (dsl_dataset_phys(ds)->ds_bp.blk_birth <=
 	    dsl_dataset_phys(snap)->ds_creation_txg)
 		return (B_FALSE);
 
 	/*
 	 * The difference may be limited to the ZIL, which gets reset in
 	 * the head; that alone does not count as a modification.  So
 	 * compare the meta dnodes rather than the whole objset.
 	 */
 	if (dmu_objset_from_ds(ds, &os) != 0)
 		return (B_TRUE);
 	if (dmu_objset_from_ds(snap, &os_snap) != 0)
 		return (B_TRUE);
 
 	return (bcmp(&os->os_phys->os_meta_dnode,
 	    &os_snap->os_phys->os_meta_dnode,
 	    sizeof (os->os_phys->os_meta_dnode)) != 0);
 }
 
 /*
  * Argument bundle for the snapshot rename sync task
  * (dsl_dataset_rename_snapshot_check() / _sync() and their _impl callbacks).
  */
 typedef struct dsl_dataset_rename_snapshot_arg {
 	/* dataset whose snapshot is renamed */
 	const char *ddrsa_fsname;
 	/* current snapshot name (without the "<dataset>@" prefix) */
 	const char *ddrsa_oldsnapname;
 	/* new snapshot name (without the "<dataset>@" prefix) */
 	const char *ddrsa_newsnapname;
 	/* if set, the callbacks are also run via DS_FIND_CHILDREN */
 	boolean_t ddrsa_recursive;
 	/* filled in by the sync function for use by the sync callback */
 	dmu_tx_t *ddrsa_tx;
 } dsl_dataset_rename_snapshot_arg_t;
 
 /*
  * Check whether the snapshot ddrsa_oldsnapname of hds can be renamed to
  * ddrsa_newsnapname.  A dataset that has no snapshot with the old name is
  * silently accepted.  Returns EEXIST if the new name is taken,
  * ENAMETOOLONG if the full new name would not fit in MAXNAMELEN, or the
  * error of a failed lookup.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	uint64_t snapobj;
 	int error;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname,
 	    &snapobj);
 	/* ignore nonexistent snapshots */
 	if (error == ENOENT)
 		return (0);
 	if (error != 0)
 		return (error);
 
 	/* new name should not exist */
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname,
 	    &snapobj);
 	switch (error) {
 	case 0:
 		error = SET_ERROR(EEXIST);
 		break;
 	case ENOENT:
 		error = 0;
 		break;
 	default:
 		/* keep the lookup error */
 		break;
 	}
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
 	if (dsl_dir_namelen(hds->ds_dir) + 1 +
 	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
 		error = SET_ERROR(ENAMETOOLONG);
 
 	return (error);
 }
 
 /*
  * Check callback of the snapshot rename sync task.  Runs the per-dataset
  * check on ddrsa_fsname only, or on it and its children (via
  * dmu_objset_find_dp with DS_FIND_CHILDREN) for a recursive rename.
  */
 static int
 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
 	if (error != 0)
 		return (error);
 
 	if (!ddrsa->ddrsa_recursive) {
 		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
 	} else {
 		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
 		    DS_FIND_CHILDREN);
 	}
 
 	dsl_dataset_rele(hds, FTAG);
 	return (error);
 }
 
 /*
  * Rename the snapshot ddrsa_oldsnapname of hds to ddrsa_newsnapname within
  * ddrsa_tx.  A dataset without a snapshot of the old name is skipped.  The
  * name entry in the head's snapshot-names ZAP is replaced and the in-core
  * ds_snapname is updated under ds_lock.  Always returns 0.
  */
 static int
 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	char *oldname, *newname;
 #endif
 #endif
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_dataset_t *ds;
 	uint64_t val;
 	dmu_tx_t *tx = ddrsa->ddrsa_tx;
 	int error;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
 	ASSERT(error == 0 || error == ENOENT);
 	if (error == ENOENT) {
 		/* ignore nonexistent snapshots */
 		return (0);
 	}
 
 	/* val is the object number of the snapshot being renamed */
 	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
 
 	/* log before we change the name */
 	spa_history_log_internal_ds(ds, "rename", tx,
 	    "-> @%s", ddrsa->ddrsa_newsnapname);
 
 	/* replace the old name -> object entry with one for the new name */
 	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
 	    B_FALSE));
 	mutex_enter(&ds->ds_lock);
 	/* the length of the new name was validated by the check callback */
 	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
 	mutex_exit(&ds->ds_lock);
 	VERIFY0(zap_add(dp->dp_meta_objset,
 	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	/*
 	 * Propagate the rename to mounted file systems and zvol minors.
 	 * NOTE(review): in the recursive case this callback also runs for
 	 * child datasets, yet the full names below are built from the
 	 * top-level ddrsa_fsname rather than from hds -- confirm that this
 	 * is intended.
 	 */
 	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
 	    ddrsa->ddrsa_oldsnapname);
 	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
 	    ddrsa->ddrsa_newsnapname);
 	zfsvfs_update_fromname(oldname, newname);
 	zvol_rename_minors(oldname, newname);
 	kmem_free(newname, MAXPATHLEN);
 	kmem_free(oldname, MAXPATHLEN);
 #endif
 #endif
 	dsl_dataset_rele(ds, FTAG);
 
 	return (0);
 }
 
 /*
  * Sync callback of the snapshot rename sync task.  Records tx in the
  * argument bundle for the per-dataset callback, then renames the snapshot
  * on ddrsa_fsname only, or on it and its children for a recursive rename.
  */
 static void
 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
 
 	/* the per-dataset callback picks the tx up from the bundle */
 	ddrsa->ddrsa_tx = tx;
 
 	if (!ddrsa->ddrsa_recursive) {
 		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
 	} else {
 		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
 		    DS_FIND_CHILDREN));
 	}
 
 	dsl_dataset_rele(hds, FTAG);
 }
 
 /*
  * Rename snapshot fsname@oldsnapname to fsname@newsnapname through a sync
  * task; with 'recursive' set the rename is also applied to the child
  * datasets.  Returns 0 or an errno.
  */
 int
 dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
 {
 	/* ddrsa_tx is filled in later by the sync callback */
 	dsl_dataset_rename_snapshot_arg_t ddrsa = {
 		.ddrsa_fsname = fsname,
 		.ddrsa_oldsnapname = oldsnapname,
 		.ddrsa_newsnapname = newsnapname,
 		.ddrsa_recursive = recursive
 	};
 	int error;
 
 	error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
 	    dsl_dataset_rename_snapshot_sync, &ddrsa,
 	    1, ZFS_SPACE_CHECK_RESERVED);
 
 	return (error);
 }
 
 /*
  * For an ownership handoff there must be exactly one long hold on the
  * dataset: the owner's.  Nothing may be changed permanently here, so the
  * owner's long hold is only dropped for the duration of the test and taken
  * again right after.  The test is limited to syncing context so that the
  * dataset cannot unexpectedly go away while the long hold is released.
  *
  * Returns EBUSY if the dataset is long-held by anyone else, 0 otherwise
  * (and always 0 outside syncing context).
  */
 static int
 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
 {
 	boolean_t others_hold;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	if (owner == NULL) {
 		others_hold = dsl_dataset_long_held(ds);
 	} else {
 		/* look past the owner's own long hold */
 		VERIFY3P(ds->ds_owner, ==, owner);
 		dsl_dataset_long_rele(ds, owner);
 		others_hold = dsl_dataset_long_held(ds);
 		dsl_dataset_long_hold(ds, owner);
 	}
 
 	if (others_hold)
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Argument bundle for the rollback sync task (see
  * dsl_dataset_rollback_check()).
  */
 typedef struct dsl_dataset_rollback_arg {
 	/* name of the dataset to roll back */
 	const char *ddra_fsname;
 	/* owner passed to dsl_dataset_handoff_check(); may be NULL */
 	void *ddra_owner;
 	/* presumably receives the result of the rollback -- TODO confirm */
 	nvlist_t *ddra_result;
 } dsl_dataset_rollback_arg_t;
 
 /*
  * Check callback for dsl_dataset_rollback().
  *
  * Holds the dataset named in the argument block and verifies that a rollback
  * to its most recent snapshot is permissible.  The hold is always dropped
  * before returning.
  *
  * Returns 0 if the rollback may proceed, otherwise:
  *   EINVAL - the dataset is a snapshot, or it has no previous snapshot
  *   EEXIST - a bookmark was created after the most recent snapshot
  *   EDQUOT - the snapshot references more than the dataset's refquota
  *   ENOSPC - not enough space for the temporarily unconsumed refreservation
  *   or the error returned by dsl_dataset_hold(), dsl_get_bookmarks_impl()
  *   or dsl_dataset_handoff_check().
  */
 static int
 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int64_t unused_refres_delta;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* must not be a snapshot */
 	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must have a most recent snapshot */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must not have any bookmarks after the most recent snapshot */
 	nvlist_t *proprequest = fnvlist_alloc();
 	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
 	nvlist_t *bookmarks = fnvlist_alloc();
 	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
 	fnvlist_free(proprequest);
 	if (error != 0) {
 		/*
 		 * Don't leak the result list or the dataset hold when the
 		 * bookmark lookup fails.
 		 */
 		fnvlist_free(bookmarks);
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
 		nvlist_t *valuenv =
 		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
 		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
 		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
 		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 			fnvlist_free(bookmarks);
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EEXIST));
 		}
 	}
 	fnvlist_free(bookmarks);
 
 	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Check if the snap we are rolling back to uses more than
 	 * the refquota.
 	 */
 	if (ds->ds_quota != 0 &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * When we do the clone swap, we will temporarily use more space
 	 * due to the refreservation (the head will no longer have any
 	 * unique space, so the entire amount of the refreservation will need
 	 * to be free).  We will immediately destroy the clone, freeing
 	 * this space, but the freeing happens over many txg's.
 	 */
 	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
 	    dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
 	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Sync callback for dsl_dataset_rollback().  Reports the name of the
  * previous snapshot under "target" in the result nvlist, creates a
  * temporary "%rollback" clone of that snapshot, swaps it with the head,
  * zeroes the head's ZIL and finally destroys the temporary clone.
  */
 static void
 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *rb = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	char target[ZFS_MAXNAMELEN];
 	dsl_dataset_t *head;
 	dsl_dataset_t *tmpclone;
 	uint64_t tmpclone_obj;
 
 	VERIFY0(dsl_dataset_hold(pool, rb->ddra_fsname, FTAG, &head));
 
 	/* Tell the caller which snapshot we are rolling back to. */
 	dsl_dataset_name(head->ds_prev, target);
 	fnvlist_add_string(rb->ddra_result, "target", target);
 
 	tmpclone_obj = dsl_dataset_create_sync(head->ds_dir, "%rollback",
 	    head->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
 	VERIFY0(dsl_dataset_hold_obj(pool, tmpclone_obj, FTAG, &tmpclone));
 
 	dsl_dataset_clone_swap_sync_impl(tmpclone, head, tx);
 	dsl_dataset_zero_zil(head, tx);
 
 	dsl_destroy_head_sync_impl(tmpclone, tx);
 
 	dsl_dataset_rele(tmpclone, FTAG);
 	dsl_dataset_rele(head, FTAG);
 }
 
 /*
  * Roll the given filesystem or volume back to its most recent snapshot.
  * On success the name of that snapshot is stored in the "result" nvlist
  * under the key "target".
  *
  * When owner != NULL:
  * - the dataset MUST already be owned by that owner on entry, and
  * - it is still held by the same owner on return, whether or not the
  *   rollback succeeded.
  *
  * Passing an owner is required whenever the existing filesystem is mounted.
  * See the notes above zfs_suspend_fs() for further details.
  */
 int
 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
 {
 	dsl_dataset_rollback_arg_t rb;
 	int rc;
 
 	rb.ddra_result = result;
 	rb.ddra_owner = owner;
 	rb.ddra_fsname = fsname;
 
 	rc = dsl_sync_task(fsname, dsl_dataset_rollback_check,
 	    dsl_dataset_rollback_sync, &rb,
 	    1, ZFS_SPACE_CHECK_RESERVED);
 
 	return (rc);
 }
 
 /*
  * One entry of a snapshot list built by snaplist_make(): a list linkage
  * plus the held dataset it refers to.
  */
 struct promotenode {
 	list_node_t link;	/* linkage used by list_create()/list_insert_tail() */
 	dsl_dataset_t *ds;	/* dataset held with the list's tag */
 };
 
 /*
  * Argument block shared by the promote check and sync callbacks.
  * promote_hold() fills in ddpa_clone, the three snapshot lists and
  * origin_origin; promote_rele() releases them again.
  */
 typedef struct dsl_dataset_promote_arg {
 	/* name of the clone to promote (set by dsl_dataset_promote()) */
 	const char *ddpa_clonename;
 	/* held dataset for ddpa_clonename */
 	dsl_dataset_t *ddpa_clone;
 	/*
 	 * shared_snaps: starts at the clone's origin snapshot and walks back;
 	 *	these are the snapshots the sync callback moves.
 	 * origin_snaps: starts at the head dataset of the origin's dsl_dir and
 	 *	walks back to (excluding) the origin snapshot.
 	 * clone_snaps: starts at the clone itself and walks back.
 	 */
 	list_t shared_snaps, origin_snaps, clone_snaps;
 	dsl_dataset_t *origin_origin; /* origin of the origin */
 	/*
 	 * Space figures computed by the check callback and applied by the
 	 * sync callback: used/comp/uncomp is the space to transfer, unique is
 	 * the origin's new unique space, and *usedsnap is the snapshot space
 	 * of the clone and origin after the promotion.
 	 */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	/* buffer that receives the name of a conflicting snapshot */
 	char *err_ds;
 	/* credentials passed to dsl_dir_transfer_possible() */
 	cred_t *cr;
 } dsl_dataset_promote_arg_t;
 
 /*
  * Forward declarations for helpers that are defined after the promote
  * check/sync callbacks which use them.
  */
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
 static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
     void *tag);
 static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
 
 /*
  * Check callback for dsl_dataset_promote().
  *
  * Takes all holds via promote_hold() and drops them again via promote_rele()
  * before returning.  In open context only the DS_FLAG_NOPROMOTE test is
  * performed; the expensive space computation is done in syncing context
  * only.  In syncing context this also fills in the space fields of the
  * argument block (used, comp, uncomp, unique, cloneusedsnap,
  * originusedsnap) for use by the sync callback.
  *
  * Returns 0 on success, otherwise:
  *   EXDEV        - the clone has DS_FLAG_NOPROMOTE set
  *   EBUSY        - one of the snapshots to be moved has a long hold
  *   ENAMETOOLONG - a moved snapshot's name would not fit after the move
  *   EEXIST       - the clone already has a snapshot of the same name; that
  *                  name is copied into ddpa->err_ds
  *   or an error from promote_hold(), dsl_dataset_snap_lookup(),
  *   dsl_dir_transfer_possible() or snaplist_space().
  */
 static int
 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	dsl_dataset_t *origin_ds;
 	int err;
 	uint64_t unused;
 	uint64_t ss_mv_cnt;
 	size_t max_snap_len;
 
 	err = promote_hold(ddpa, dp, FTAG);
 	if (err != 0)
 		return (err);
 
 	hds = ddpa->ddpa_clone;
 	/* room left for a snapshot name once the clone's name is accounted for */
 	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
 
 	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
 		promote_rele(ddpa, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/*
 	 * Compute and check the amount of space to transfer.  Since this is
 	 * so expensive, don't do the preliminary check.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		promote_rele(ddpa, FTAG);
 		return (0);
 	}
 
 	/* the head of shared_snaps is the clone's origin snapshot */
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
 
 	/* compute origin's new unique space */
 	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
 	    origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
 	    &ddpa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
 	 * to used by each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
 	 * So a sequence would look like:
 	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
 	 * Which simplifies to:
 	 * uN + kN + kN-1 + ... + k1 + k0
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
 	ss_mv_cnt = 0;
 	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
 	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
 	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
 		/* counts the snapshots that will move to the clone's dir */
 		ss_mv_cnt++;
 
 		/*
 		 * If there are long holds, we won't be able to evict
 		 * the objset.
 		 */
 		if (dsl_dataset_long_held(ds)) {
 			err = SET_ERROR(EBUSY);
 			goto out;
 		}
 
 		/* Check that the snapshot name does not conflict */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (strlen(ds->ds_snapname) >= max_snap_len) {
 			err = SET_ERROR(ENAMETOOLONG);
 			goto out;
 		}
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
 			/* lookup succeeded: the clone already has this name */
 			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
 			err = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/* ENOENT is the expected "no conflict" answer */
 		if (err != ENOENT)
 			goto out;
 
 		/* The very first snapshot does not have a deadlist */
 		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
 			continue;
 
 		dsl_deadlist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp);
 		ddpa->used += dlused;
 		ddpa->comp += dlcomp;
 		ddpa->uncomp += dluncomp;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (ddpa->origin_origin) {
 		ddpa->used -=
 		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
 		ddpa->comp -=
 		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
 		ddpa->uncomp -=
 		    dsl_dataset_phys(ddpa->origin_origin)->
 		    ds_uncompressed_bytes;
 	}
 
 	/* Check that there is enough space and limit headroom here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
 	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
 	if (err != 0)
 		goto out;
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
 	 * after the promotion (for both origin and clone).  For each,
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
 	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
 		 * so dd_origin_txg will be < TXG_INITIAL, so
 		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
 		snap = list_head(&ddpa->origin_snaps);
 		err = snaplist_space(&ddpa->shared_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
 		if (err != 0)
 			goto out;
 
 		err = snaplist_space(&ddpa->clone_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &space);
 		if (err != 0)
 			goto out;
 		ddpa->cloneusedsnap += space;
 	}
 	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
 	    DD_FLAG_USED_BREAKDOWN) {
 		err = snaplist_space(&ddpa->origin_snaps,
 		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
 		    &ddpa->originusedsnap);
 		if (err != 0)
 			goto out;
 	}
 
 	/* common exit: every path past promote_hold() must drop the holds */
 out:
 	promote_rele(ddpa, FTAG);
 	return (err);
 }
 
 /*
  * Sync callback for dsl_dataset_promote().
  *
  * Re-takes all holds via promote_hold(), then:
  *  - re-links the origin snapshot's next-snapshot / next-clones pointers,
  *  - swaps the origin relationship between the clone's dsl_dir (dd) and
  *    the origin's dsl_dir (odd), including the dd_clones entries,
  *  - moves every snapshot on shared_snaps from odd to dd (snapshot name
  *    entry, containing dsl_dir, and any clone references),
  *  - applies the space accounting computed by the check callback,
  *  - logs a "promote" history record.
  *
  * On FreeBSD kernels each moved snapshot is also renamed for mounted
  * filesystems and zvol minors, which needs the name both before and after
  * the move.
  */
 static void
 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	dsl_dataset_t *origin_ds;
 	dsl_dataset_t *origin_head;
 	dsl_dir_t *dd;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	char *oldname, *newname;
 #endif
 
 	VERIFY0(promote_hold(ddpa, dp, FTAG));
 	hds = ddpa->ddpa_clone;
 
 	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
 
 	/* the head of shared_snaps is the clone's origin snapshot */
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
 	dd = hds->ds_dir;
 
 	/* the head of origin_snaps is the head dataset of the origin's dir */
 	snap = list_head(&ddpa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
 	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
 	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
 	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
 	    origin_ds->ds_object);
 	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
 	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
 		dsl_dataset_remove_from_next_clones(origin_ds,
 		    snap->ds->ds_object, tx);
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
 	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
 	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
 	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
 	origin_head->ds_dir->dd_origin_txg =
 	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
 		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    hds->ds_object, tx));
 
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
 		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    origin_head->ds_object, tx));
 		if (dsl_dir_phys(dd)->dd_clones == 0) {
 			dsl_dir_phys(dd)->dd_clones =
 			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
 			    DMU_OT_NONE, 0, tx);
 		}
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
 	}
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
 	mutex_enter(&spa_namespace_lock);
 
 	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 #endif
 
 	/* move snapshots to this dir */
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
 
 		/*
 		 * Property callbacks are registered to a particular
 		 * dsl_dir.  Since ours is changing, evict the objset
 		 * so that they will be unregistered from the old dsl_dir.
 		 */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 		/*
 		 * Record the snapshot's current name while it still lives
 		 * in the old dsl_dir; the rename calls below need it and
 		 * the buffer from kmem_alloc() is otherwise uninitialized.
 		 */
 		dsl_dataset_name(ds, oldname);
 #endif
 
 		/* move snap name entry */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		VERIFY0(dsl_dataset_snap_remove(origin_head,
 		    ds->ds_snapname, tx, B_TRUE));
 		VERIFY0(zap_add(dp->dp_meta_objset,
 		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
 		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_rele(ds->ds_dir, ds);
 		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 		/* ds now lives in dd, so this yields the post-move name */
 		dsl_dataset_name(ds, newname);
 		zfsvfs_update_fromname(oldname, newname);
 		zvol_rename_minors(oldname, newname);
 #endif
 
 		/* move any clone references */
 		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
 		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				dsl_dataset_t *cnds;
 				uint64_t o;
 
 				if (za.za_first_integer == oldnext_obj) {
 					/*
 					 * We've already moved the
 					 * origin's reference.
 					 */
 					continue;
 				}
 
 				VERIFY0(dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
 				o = dsl_dir_phys(cnds->ds_dir)->
 				    dd_head_dataset_obj;
 
 				VERIFY0(zap_remove_int(dp->dp_meta_objset,
 				    dsl_dir_phys(odd)->dd_clones, o, tx));
 				VERIFY0(zap_add_int(dp->dp_meta_objset,
 				    dsl_dir_phys(dd)->dd_clones, o, tx));
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
 		}
 
 		ASSERT(!dsl_prop_hascb(ds));
 	}
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
 	mutex_exit(&spa_namespace_lock);
 
 	kmem_free(newname, MAXPATHLEN);
 	kmem_free(oldname, MAXPATHLEN);
 #endif
 	/*
 	 * Change space accounting.
 	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
 	 * both be valid, or both be 0 (resulting in delta == 0).  This
 	 * is true for each of {clone,origin} independently.
 	 */
 
 	delta = ddpa->cloneusedsnap -
 	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
 	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
 	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
 	delta = ddpa->originusedsnap -
 	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
 	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
 	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
 	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, "");
 
 	dsl_dir_rele(odd, FTAG);
 	promote_rele(ddpa, FTAG);
 }
 
 /*
  * Build, in "l", a list of held datasets for the snapshots between first_obj
  * (exclusive) and last_obj (inclusive), following ds_prev_snap_obj links.
  * The list ends up newest-first, i.e. last_obj is the list_head().  When
  * first_obj == 0 the walk stops at this dataset's origin instead.
  *
  * Each dataset is held with "tag".  On a hold failure the error is returned
  * with the entries gathered so far still on the list.
  */
 static int
 snaplist_make(dsl_pool_t *dp,
     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
 {
 	uint64_t cur;
 
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
 	for (cur = last_obj; cur != first_obj; ) {
 		struct promotenode *node;
 		dsl_dataset_t *held;
 		int rc;
 
 		rc = dsl_dataset_hold_obj(dp, cur, tag, &held);
 		ASSERT(rc != ENOENT);
 		if (rc != 0)
 			return (rc);
 
 		/* No explicit lower bound: stop at the origin instead. */
 		if (first_obj == 0)
 			first_obj = dsl_dir_phys(held->ds_dir)->dd_origin_obj;
 
 		node = kmem_alloc(sizeof (*node), KM_SLEEP);
 		node->ds = held;
 		list_insert_tail(l, node);
 
 		cur = dsl_dataset_phys(held)->ds_prev_snap_obj;
 	}
 
 	return (0);
 }
 
 /*
  * Sum, over every dataset on list "l", the "used" space reported by
  * dsl_deadlist_space_range() for the txg range [mintxg, UINT64_MAX] and
  * store the total in *spacep.  Always returns 0.
  */
 static int
 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 {
 	struct promotenode *node = list_head(l);
 
 	*spacep = 0;
 	while (node != NULL) {
 		uint64_t dl_used, dl_comp, dl_uncomp;
 
 		/* only the "used" figure is accumulated */
 		dsl_deadlist_space_range(&node->ds->ds_deadlist,
 		    mintxg, UINT64_MAX, &dl_used, &dl_comp, &dl_uncomp);
 		*spacep += dl_used;
 
 		node = list_next(l, node);
 	}
 
 	return (0);
 }
 
 /*
  * Tear down a list built by snaplist_make(): release every dataset with
  * "tag", free the nodes (tail first) and destroy the list.  A NULL list or
  * one whose list head is not active is left untouched.
  */
 static void
 snaplist_destroy(list_t *l, void *tag)
 {
 	if (l != NULL && list_link_active(&l->list_head)) {
 		for (;;) {
 			struct promotenode *node = list_tail(l);
 
 			if (node == NULL)
 				break;
 
 			list_remove(l, node);
 			dsl_dataset_rele(node->ds, tag);
 			kmem_free(node, sizeof (*node));
 		}
 		list_destroy(l);
 	}
 }
 
 /*
  * Take every hold the promote callbacks need: the clone itself, the three
  * snapshot lists (shared_snaps, clone_snaps, origin_snaps) and, if the
  * origin's dsl_dir itself has an origin, that dataset as origin_origin.
  *
  * Returns EINVAL (holding nothing) if the named dataset is a snapshot or is
  * not a clone.  If any later step fails, everything is dropped again via
  * promote_rele() and that step's error is returned.
  */
 static int
 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
 {
 	struct promotenode *origin_node = NULL;
 	dsl_dir_t *clone_dir;
 	int rc;
 
 	rc = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
 	    &ddpa->ddpa_clone);
 	if (rc != 0)
 		return (rc);
 	clone_dir = ddpa->ddpa_clone->ds_dir;
 
 	if (!dsl_dir_is_clone(clone_dir) ||
 	    ddpa->ddpa_clone->ds_is_snapshot) {
 		dsl_dataset_rele(ddpa->ddpa_clone, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	rc = snaplist_make(dp, 0, dsl_dir_phys(clone_dir)->dd_origin_obj,
 	    &ddpa->shared_snaps, tag);
 
 	if (rc == 0) {
 		rc = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
 		    &ddpa->clone_snaps, tag);
 	}
 
 	if (rc == 0) {
 		origin_node = list_head(&ddpa->shared_snaps);
 		ASSERT3U(origin_node->ds->ds_object, ==,
 		    dsl_dir_phys(clone_dir)->dd_origin_obj);
 		rc = snaplist_make(dp, dsl_dir_phys(clone_dir)->dd_origin_obj,
 		    dsl_dir_phys(origin_node->ds->ds_dir)->dd_head_dataset_obj,
 		    &ddpa->origin_snaps, tag);
 	}
 
 	/* A clone of a clone: also hold the origin's own origin. */
 	if (rc == 0 &&
 	    dsl_dir_phys(origin_node->ds->ds_dir)->dd_origin_obj != 0) {
 		rc = dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(origin_node->ds->ds_dir)->dd_origin_obj,
 		    tag, &ddpa->origin_origin);
 	}
 
 	if (rc != 0)
 		promote_rele(ddpa, tag);
 
 	return (rc);
 }
 
 /*
  * Drop everything promote_hold() acquired: the three snapshot lists, the
  * origin's origin (if one was held) and finally the clone itself.
  */
 static void
 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
 {
 	dsl_dataset_t *grand_origin;
 
 	snaplist_destroy(&ddpa->shared_snaps, tag);
 	snaplist_destroy(&ddpa->clone_snaps, tag);
 	snaplist_destroy(&ddpa->origin_snaps, tag);
 
 	grand_origin = ddpa->origin_origin;
 	if (grand_origin != NULL) {
 		dsl_dataset_rele(grand_origin, tag);
 	}
 
 	dsl_dataset_rele(ddpa->ddpa_clone, tag);
 }
 
 /*
  * Promote the clone called "name".
  *
  * Should the promotion fail because of a conflicting snapshot name, that
  * name is written into "conflsnap".  (It must be at least MAXNAMELEN bytes
  * long.)
  *
  * Returns 0 on success, or the error from dmu_objset_hold(), zap_count()
  * or dsl_sync_task().
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
 {
 	dsl_dataset_promote_arg_t promote_arg = { 0 };
 	uint64_t snap_count;
 	objset_t *objset;
 	int rc;
 
 	/*
 	 * The amount of space we modify is proportional to the number of
 	 * snapshots, so count them first.
 	 */
 	rc = dmu_objset_hold(name, FTAG, &objset);
 	if (rc != 0)
 		return (rc);
 
 	rc = zap_count(dmu_objset_pool(objset)->dp_meta_objset,
 	    dsl_dataset_phys(dmu_objset_ds(objset))->ds_snapnames_zapobj,
 	    &snap_count);
 	dmu_objset_rele(objset, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	promote_arg.ddpa_clonename = name;
 	promote_arg.err_ds = conflsnap;
 	promote_arg.cr = CRED();
 
 	rc = dsl_sync_task(name, dsl_dataset_promote_check,
 	    dsl_dataset_promote_sync, &promote_arg,
 	    2 + snap_count, ZFS_SPACE_CHECK_RESERVED);
 
 	return (rc);
 }
 
 /*
  * Decide whether "clone" and "origin_head" may be swapped.
  *
  * Returns 0 if the swap is allowed, otherwise:
  *   EINVAL  - either dataset is a snapshot; without "force" the two do not
  *             share the same previous snapshot; the clone's previous
  *             snapshot lives in a different dsl_dir than origin_head; or
  *             the clone's dsl_dir is not a child of origin_head's
  *   ETXTBSY - origin_head was modified since its snapshot (unless "force")
  *   EBUSY   - the handoff check on origin_head failed
  *   ENOSPC  - not enough space for the growth in unconsumed refreservation
  *   EDQUOT  - the clone references more than origin_head's refquota
  */
 int
 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
 	int64_t refres_growth;
 
 	/* both datasets have to be heads */
 	if (origin_head->ds_is_snapshot || clone->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	/* unless forced, the branch point must be directly before both */
 	if (clone->ds_prev != origin_head->ds_prev && !force)
 		return (SET_ERROR(EINVAL));
 
 	/* the clone must really be a clone of origin_head (or unrelated) */
 	if (clone->ds_prev != NULL) {
 		dsl_dataset_t *branch = clone->ds_prev;
 
 		if (branch != clone->ds_dir->dd_pool->dp_origin_snap &&
 		    branch->ds_dir != origin_head->ds_dir)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/* the clone's dir has to be a child of the origin's dir */
 	if (origin_head->ds_dir != clone->ds_dir->dd_parent)
 		return (SET_ERROR(EINVAL));
 
 	/* unless forced, origin_head must be unmodified */
 	if (!force &&
 	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
 		return (SET_ERROR(ETXTBSY));
 
 	/* origin_head must have no long holds (e.g. is not mounted) */
 	if (dsl_dataset_handoff_check(origin_head, owner, tx) != 0)
 		return (SET_ERROR(EBUSY));
 
 	/* how much more unconsumed refreservation will the swap need? */
 	refres_growth =
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	if (refres_growth > 0 &&
 	    refres_growth >
 	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/* the clone must fit within the head's refquota */
 	if (origin_head->ds_quota != 0 &&
 	    origin_head->ds_quota <
 	    dsl_dataset_phys(clone)->ds_referenced_bytes)
 		return (SET_ERROR(EDQUOT));
 
 	return (0);
 }
 
 /*
  * Swap the contents of "clone" and "origin_head" in the given tx.
  *
  * The per-dataset feature flags, block pointers, ds_*_bytes counters and
  * deadlists of the two datasets are exchanged, any cached objsets are
  * evicted, and the dsl_dir space accounting (head, snapshot and
  * refreservation usage) is adjusted to match.  The origin snapshot's
  * unique bytes are recomputed when the clone has a previous snapshot.
  * Finally the scan code is notified and a "clone swap" history record is
  * logged.
  *
  * The asserts below require that the clone has no refreservation and that
  * both datasets share the same previous snapshot.
  */
 void
 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int64_t unused_refres_delta;
 
 	ASSERT(clone->ds_reserved == 0);
 	ASSERT(origin_head->ds_quota == 0 ||
 	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
 	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
 	/*
 	 * Swap per-dataset feature flags.
 	 */
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (!(spa_feature_table[f].fi_flags &
 		    ZFEATURE_FLAG_PER_DATASET)) {
 			ASSERT(!clone->ds_feature_inuse[f]);
 			ASSERT(!origin_head->ds_feature_inuse[f]);
 			continue;
 		}
 
 		/* remember both states before either side is changed */
 		boolean_t clone_inuse = clone->ds_feature_inuse[f];
 		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
 
 		/* deactivate on both sides first, then activate crosswise */
 		if (clone_inuse) {
 			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
 			clone->ds_feature_inuse[f] = B_FALSE;
 		}
 		if (origin_head_inuse) {
 			dsl_dataset_deactivate_feature(origin_head->ds_object,
 			    f, tx);
 			origin_head->ds_feature_inuse[f] = B_FALSE;
 		}
 		if (clone_inuse) {
 			dsl_dataset_activate_feature(origin_head->ds_object,
 			    f, tx);
 			origin_head->ds_feature_inuse[f] = B_TRUE;
 		}
 		if (origin_head_inuse) {
 			dsl_dataset_activate_feature(clone->ds_object, f, tx);
 			clone->ds_feature_inuse[f] = B_TRUE;
 		}
 	}
 
 	dmu_buf_will_dirty(clone->ds_dbuf, tx);
 	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
 	/* drop any cached objsets on both datasets */
 	if (clone->ds_objset != NULL) {
 		dmu_objset_evict(clone->ds_objset);
 		clone->ds_objset = NULL;
 	}
 
 	if (origin_head->ds_objset != NULL) {
 		dmu_objset_evict(origin_head->ds_objset);
 		origin_head->ds_objset = NULL;
 	}
 
 	/*
 	 * Computed before the ds_unique_bytes values are swapped below;
 	 * applied to the dsl_dir near the end of this function.
 	 */
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/*
 	 * Reset origin's unique bytes, if it exists.
 	 */
 	if (clone->ds_prev) {
 		dsl_dataset_t *origin = clone->ds_prev;
 		uint64_t comp, uncomp;
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
 		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
 	{
 		blkptr_t tmp;
 		tmp = dsl_dataset_phys(origin_head)->ds_bp;
 		dsl_dataset_phys(origin_head)->ds_bp =
 		    dsl_dataset_phys(clone)->ds_bp;
 		dsl_dataset_phys(clone)->ds_bp = tmp;
 	}
 
 	/* set dd_*_bytes */
 	{
 		int64_t dused, dcomp, duncomp;
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
 		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
 		dsl_deadlist_space(&clone->ds_deadlist,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space(&origin_head->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
 		/*
 		 * Each delta is (clone's referenced + deadlist space) minus
 		 * (origin_head's referenced + deadlist space).
 		 */
 		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
 		    cdl_used -
 		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
 		    odl_used);
 		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
 		    cdl_comp -
 		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
 		    odl_comp);
 		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
 		    cdl_uncomp -
 		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
 		    odl_uncomp);
 
 		/* the origin's dir gains what the clone's dir loses */
 		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
 		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
 		    -dused, -dcomp, -duncomp, tx);
 
 		/*
 		 * The difference in the space used by snapshots is the
 		 * difference in snapshot space due to the head's
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
 		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space_range(&origin_head->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, NULL);
 	}
 
 	/* swap ds_*_bytes */
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
 	    dsl_dataset_phys(clone)->ds_referenced_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
 	    dsl_dataset_phys(clone)->ds_compressed_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
 	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
 	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
 	    unused_refres_delta, 0, 0, tx);
 
 	/*
 	 * Swap deadlists.
 	 */
 	/* close both, exchange the object numbers, then reopen both */
 	dsl_deadlist_close(&clone->ds_deadlist);
 	dsl_deadlist_close(&origin_head->ds_deadlist);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
 	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
 	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
 	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
 	spa_history_log_internal_ds(clone, "clone swap", tx,
 	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
 
 /*
  * Translate a dataset object number into the dataset's name.
  *
  * "pname" names the pool, "obj" is the dataset object number within it and
  * "buf" receives the name via dsl_dataset_name().  Returns 0 on success, or
  * the error from dsl_pool_hold() / dsl_dataset_hold_obj(), in which case
  * "buf" is not written.
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	dsl_dataset_t *held;
 	dsl_pool_t *pool;
 	int rc;
 
 	rc = dsl_pool_hold(pname, FTAG, &pool);
 	if (rc != 0)
 		return (rc);
 
 	rc = dsl_dataset_hold_obj(pool, obj, FTAG, &held);
 	if (rc != 0) {
 		dsl_pool_rele(pool, FTAG);
 		return (rc);
 	}
 
 	dsl_dataset_name(held, buf);
 	dsl_dataset_rele(held, FTAG);
 	dsl_pool_rele(pool, FTAG);
 
 	return (0);
 }
 
 /*
  * Refquota / refreservation check for a write of "asize" bytes with
  * "inflight" bytes already pending.
  *
  * If the dataset has unconsumed refreservation, *used is reduced by that
  * amount and *ref_rsrv is set to the part of asize that will be covered by
  * it; otherwise *ref_rsrv is 0.
  *
  * Returns 0 when no quota check is requested, no refquota is set, or the
  * estimate is under quota.  When the estimate is at or over quota the
  * caller gets ERESTART (try again) unless the on-disk referenced bytes
  * alone reach the quota and nothing is in flight, in which case EDQUOT is
  * returned.
  */
 int
 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
 	int rc = 0;
 
 	ASSERT3S(asize, >, 0);
 
 	/* Portion of asize coming out of unconsumed refreservation. */
 	*ref_rsrv = 0;
 
 	mutex_enter(&ds->ds_lock);
 
 	/* Adjust for reserved-but-not-yet-unique bytes. */
 	if (dsl_dataset_phys(ds)->ds_unique_bytes < ds->ds_reserved) {
 		uint64_t unconsumed =
 		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 
 		ASSERT3U(*used, >=, unconsumed);
 		*used -= unconsumed;
 		*ref_rsrv =
 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
 
 	if (check_quota && ds->ds_quota != 0) {
 		uint64_t referenced =
 		    dsl_dataset_phys(ds)->ds_referenced_bytes;
 
 		/*
 		 * Over the estimate: let the caller retry unless the on-disk
 		 * usage is itself over quota and no pending changes could
 		 * free up space.
 		 */
 		if (referenced + inflight >= ds->ds_quota) {
 			if (inflight == 0 && referenced >= ds->ds_quota)
 				rc = SET_ERROR(EDQUOT);
 			else
 				rc = SET_ERROR(ERESTART);
 		}
 	}
 
 	mutex_exit(&ds->ds_lock);
 
 	return (rc);
 }
 
 /*
  * Argument block for the set-refquota and set-refreservation sync tasks.
  */
 typedef struct dsl_dataset_set_qr_arg {
 	const char *ddsqra_name;	/* dataset name to hold */
 	zprop_source_t ddsqra_source;	/* source passed to the dsl_prop calls */
 	uint64_t ddsqra_value;		/* requested property value */
 } dsl_dataset_set_qr_arg_t;
 
 
 /*
  * Check callback for dsl_dataset_set_refquota().
  *
  * Returns ENOTSUP if the pool predates SPA_VERSION_REFQUOTA, EINVAL for a
  * snapshot, ENOSPC if the predicted new value is non-zero but smaller than
  * the dataset's referenced bytes or its refreservation, or the error from
  * dsl_dataset_hold() / dsl_prop_predict().  A predicted value of 0 is
  * always accepted.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *qr = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t predicted;
 	int rc;
 
 	if (spa_version(pool->dp_spa) < SPA_VERSION_REFQUOTA)
 		return (SET_ERROR(ENOTSUP));
 
 	rc = dsl_dataset_hold(pool, qr->ddsqra_name, FTAG, &ds);
 	if (rc != 0)
 		return (rc);
 
 	if (ds->ds_is_snapshot) {
 		rc = SET_ERROR(EINVAL);
 	} else {
 		rc = dsl_prop_predict(ds->ds_dir,
 		    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 		    qr->ddsqra_source, qr->ddsqra_value, &predicted);
 
 		/* a non-zero quota must cover current usage and reservation */
 		if (rc == 0 && predicted != 0 &&
 		    (predicted < dsl_dataset_phys(ds)->ds_referenced_bytes ||
 		    predicted < ds->ds_reserved))
 			rc = SET_ERROR(ENOSPC);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (rc);
 }
 
 /*
  * Sync callback for dsl_dataset_set_refquota().  Stores the property, reads
  * back the resulting effective value and, if it differs from the cached
  * ds_quota, dirties the dataset's dbuf and updates ds_quota.
  */
 static void
 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *qr = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	const char *propname;
 	dsl_dataset_t *ds;
 	uint64_t effective;
 
 	VERIFY0(dsl_dataset_hold(pool, qr->ddsqra_name, FTAG, &ds));
 
 	propname = zfs_prop_to_name(ZFS_PROP_REFQUOTA);
 	dsl_prop_set_sync_impl(ds, propname,
 	    qr->ddsqra_source, sizeof (qr->ddsqra_value), 1,
 	    &qr->ddsqra_value, tx);
 
 	propname = zfs_prop_to_name(ZFS_PROP_REFQUOTA);
 	VERIFY0(dsl_prop_get_int_ds(ds, propname, &effective));
 
 	if (effective != ds->ds_quota) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_quota = effective;
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Set the refquota property of dataset "dsname" to "refquota" with the
  * given property source, by running the refquota check/sync callbacks as a
  * sync task.  Returns whatever dsl_sync_task() returns.
  */
 int
 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t refquota)
 {
 	dsl_dataset_set_qr_arg_t qr;
 	int rc;
 
 	qr.ddsqra_value = refquota;
 	qr.ddsqra_source = source;
 	qr.ddsqra_name = dsname;
 
 	rc = dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
 	    dsl_dataset_set_refquota_sync, &qr, 0, ZFS_SPACE_CHECK_NONE);
 
 	return (rc);
 }
 
 /*
  * Check callback for the refreservation sync task.
  *
  * Returns ENOTSUP if the pool version predates SPA_VERSION_REFRESERVATION
  * and EINVAL if the named dataset is a snapshot.  In open context the
  * request is accepted once the property value has been predicted.  In
  * syncing context it returns ENOSPC when the reservation would grow by more
  * than the space available to the dsl_dir, or when the new value exceeds a
  * non-zero ds_quota.  Errors from dsl_dataset_hold() and dsl_prop_predict()
  * are returned unchanged.
  */
 static int
 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *qr = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t predicted, unique, want, have;
 	int err;
 
 	if (spa_version(pool->dp_spa) < SPA_VERSION_REFRESERVATION)
 		return (SET_ERROR(ENOTSUP));
 
 	err = dsl_dataset_hold(pool, qr->ddsqra_name, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	/* Everything past this point leaves through "out" to drop the hold. */
 	if (ds->ds_is_snapshot) {
 		err = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	err = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    qr->ddsqra_source, qr->ddsqra_value, &predicted);
 	if (err != 0)
 		goto out;
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate, so succeed here (err is 0).
 	 */
 	if (!dmu_tx_is_syncing(tx))
 		goto out;
 
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
 		dsl_dataset_recalc_head_uniq(ds);
 	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	/* Compare the space charged after the change with what is charged now. */
 	want = MAX(unique, predicted);
 	have = MAX(unique, ds->ds_reserved);
 	if (want > have) {
 		uint64_t delta = want - have;
 
 		if (delta >
 		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
 		    (ds->ds_quota > 0 && predicted > ds->ds_quota))
 			err = SET_ERROR(ENOSPC);
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Store the refreservation property on 'ds', cache its effective value in
  * ds_reserved, and charge the resulting change to the dataset's dsl_dir
  * under DD_USED_REFRSRV.
  *
  * ds	  dataset whose property is being set
  * source  property source passed through to dsl_prop_set_sync_impl()
  * value	  requested refreservation
  * tx	  transaction the change is made in
  */
 void
 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
 {
 	uint64_t newval;
 	uint64_t unique;
 	int64_t delta;
 
 	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    source, sizeof (value), 1, &value, tx);
 
 	/* Read back the effective value rather than trusting 'value'. */
 	VERIFY0(dsl_prop_get_int_ds(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	/* dd_lock is taken before ds_lock and held across the accounting call. */
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	/*
 	 * Only the part of a reservation that exceeds the unique bytes is
 	 * counted; delta is the new excess minus the old excess and may be
 	 * negative.
 	 */
 	delta = MAX(0, (int64_t)(newval - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = newval;
 	mutex_exit(&ds->ds_lock);
 
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
 /*
  * Sync callback for the refreservation sync task: holds the named dataset,
  * applies the change through dsl_dataset_set_refreservation_sync_impl(),
  * and drops the hold.
  */
 static void
 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *qr = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_dataset_t *held;
 
 	VERIFY0(dsl_dataset_hold(pool, qr->ddsqra_name, FTAG, &held));
 	dsl_dataset_set_refreservation_sync_impl(held, qr->ddsqra_source,
 	    qr->ddsqra_value, tx);
 	dsl_dataset_rele(held, FTAG);
 }
 
 /*
  * Set the refreservation property on dataset 'dsname' by dispatching a
  * sync task with the refreservation check and sync callbacks.  Returns the
  * result of dsl_sync_task().
  */
 int
 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t refreservation)
 {
 	dsl_dataset_set_qr_arg_t ddsqra = {
 		.ddsqra_name = dsname,
 		.ddsqra_source = source,
 		.ddsqra_value = refreservation
 	};
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
 	    dsl_dataset_set_refreservation_sync, &ddsqra,
 	    0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Return (in *usedp) the amount of space written in new that is not
  * present in oldsnap.  New may be a snapshot or the head.  Old must be
  * a snapshot before new, in new's filesystem (or its origin).  If not then
  * fail and return EINVAL.
  *
  * The written space is calculated by considering two components:  First, we
  * ignore any freed space, and calculate the written as new's used space
  * minus old's used space.  Next, we add in the amount of space that was freed
  * between the two snapshots, thus reducing new's used space relative to old's.
  * Specifically, this is the space that was born before old->ds_creation_txg,
  * and freed before new (ie. on new's deadlist or a previous deadlist).
  *
  * space freed                         [---------------------]
  * snapshots                       ---O-------O--------O-------O------
  *                                         oldsnap            new
  *
  * *compp and *uncompp receive the same calculation for compressed and
  * uncompressed bytes.  Returns 0 on success, EINVAL as described above, or
  * the error from dsl_dataset_hold_obj(); on failure the output values hold
  * whatever had been accumulated when the walk stopped.
  */
 int
 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	/* First component: new's totals minus oldsnap's totals. */
 	*usedp = 0;
 	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
 	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
 
 	*compp = 0;
 	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
 	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
 
 	*uncompp = 0;
 	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
 	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
 
 	/*
 	 * Second component: walk backward from new along ds_prev_snap_obj
 	 * until oldsnap is reached, adding each deadlist's contribution.
 	 */
 	snapobj = new->ds_object;
 	while (snapobj != oldsnap->ds_object) {
 		dsl_dataset_t *snap;
 		uint64_t used, comp, uncomp;
 
 		/* new is used directly; it is never held or released here. */
 		if (snapobj == new->ds_object) {
 			snap = new;
 		} else {
 			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
 			if (err != 0)
 				break;
 		}
 
 		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
 		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
 			/*
 			 * The blocks in the deadlist can not be born after
 			 * ds_prev_snap_txg, so get the whole deadlist space,
 			 * which is more efficient (especially for old-format
 			 * deadlists).  Unfortunately the deadlist code
 			 * doesn't have enough information to make this
 			 * optimization itself.
 			 */
 			dsl_deadlist_space(&snap->ds_deadlist,
 			    &used, &comp, &uncomp);
 		} else {
 			dsl_deadlist_space_range(&snap->ds_deadlist,
 			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
 			    &used, &comp, &uncomp);
 		}
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		/*
 		 * If we get to the beginning of the chain of snapshots
 		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
 		 * was not a snapshot of/before new.
 		 */
 		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 		/* Drop only the holds this loop took itself. */
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
 		if (snapobj == 0) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 	}
 	return (err);
 }
 
 /*
  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
  * lastsnap, and all snapshots in between are deleted.
  *
  * blocks that would be freed            [---------------------------]
  * snapshots                       ---O-------O--------O-------O--------O
  *                                        firstsnap        lastsnap
  *
  * This is the set of blocks that were born after the snap before firstsnap,
  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
  * We calculate this by iterating over the relevant deadlists (from the snap
  * after lastsnap, backward to the snap after firstsnap), summing up the
  * space on the deadlist that was born after the snap before firstsnap.
  *
  * *compp and *uncompp receive the matching compressed and uncompressed
  * totals.  Returns 0 on success, EINVAL if the two snapshots are in
  * different dsl_dirs or firstsnap was created after lastsnap (the output
  * values are not written in that case), or the error from
  * dsl_dataset_hold_obj().
  */
 int
 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
     dsl_dataset_t *lastsnap,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
 	ASSERT(firstsnap->ds_is_snapshot);
 	ASSERT(lastsnap->ds_is_snapshot);
 
 	/*
 	 * Check that the snapshots are in the same dsl_dir, and firstsnap
 	 * is before lastsnap.
 	 */
 	if (firstsnap->ds_dir != lastsnap->ds_dir ||
 	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
 	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
 		return (SET_ERROR(EINVAL));
 
 	*usedp = *compp = *uncompp = 0;
 
 	/* Start at the dataset after lastsnap and walk back toward firstsnap. */
 	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
 	while (snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
 		uint64_t used, comp, uncomp;
 
 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
 		if (err != 0)
 			break;
 
 		/* Count only entries born after the snap before firstsnap. */
 		dsl_deadlist_space_range(&ds->ds_deadlist,
 		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Report whether 'earlier' lies on 'later's timeline at an earlier point:
  * an older snapshot of the same filesystem, the origin of 'later's
  * filesystem, an older snapshot in the origin's filesystem, or an origin
  * further up the clone chain (found by recursing through each origin).
  *
  * A non-zero earlier_txg replaces earlier's ds_creation_txg in the
  * comparison.  Returns B_FALSE if an origin cannot be held.
  */
 boolean_t
 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
     uint64_t earlier_txg)
 {
 	dsl_pool_t *dp = later->ds_dir->dd_pool;
 	dsl_dataset_t *origin;
 	uint64_t origin_obj;
 	boolean_t before;
 
 	ASSERT(dsl_pool_config_held(dp));
 	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
 
 	if (earlier_txg == 0)
 		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
 
 	/* A snapshot cannot come after something created at or after it. */
 	if (later->ds_is_snapshot &&
 	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
 		return (B_FALSE);
 
 	if (later->ds_dir == earlier->ds_dir)
 		return (B_TRUE);
 	if (!dsl_dir_is_clone(later->ds_dir))
 		return (B_FALSE);
 
 	origin_obj = dsl_dir_phys(later->ds_dir)->dd_origin_obj;
 	if (origin_obj == earlier->ds_object)
 		return (B_TRUE);
 
 	/* Otherwise repeat the question one step up the clone chain. */
 	if (dsl_dataset_hold_obj(dp, origin_obj, FTAG, &origin) != 0)
 		return (B_FALSE);
 	before = dsl_dataset_is_before(origin, earlier, earlier_txg);
 	dsl_dataset_rele(origin, FTAG);
 	return (before);
 }
 
 /*
  * Zapify the dataset's object in the pool's meta objset, passing
  * DMU_OT_DSL_DATASET as the object's existing type.
  */
 void
 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dmu_object_zapify(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_object, DMU_OT_DSL_DATASET, tx);
 }
 
 /*
  * Return B_TRUE if the object behind the dataset's dbuf reports type
  * DMU_OTN_ZAP_METADATA, B_FALSE otherwise.
  */
 boolean_t
 dsl_dataset_is_zapified(dsl_dataset_t *ds)
 {
 	dmu_object_info_t info;
 
 	dmu_object_info_from_db(ds->ds_dbuf, &info);
 	if (info.doi_type == DMU_OTN_ZAP_METADATA)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Return B_TRUE if the dataset is zapified and its ZAP contains the
  * DS_FIELD_RESUME_TOGUID entry.
  */
 boolean_t
 dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
 {
 	/* The ZAP is consulted only once the object is known to be zapified. */
 	if (!dsl_dataset_is_zapified(ds))
 		return (B_FALSE);
 
 	return (zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	(revision 296519)
@@ -1,538 +1,539 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dsl_dataset.h>
 #include <sys/dmu.h>
 #include <sys/refcount.h>
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_pool.h>
 
 /*
  * Deadlist concurrency:
  *
  * Deadlists can only be modified from the syncing thread.
  *
  * Except for dsl_deadlist_insert(), it can only be modified with the
  * dp_config_rwlock held with RW_WRITER.
  *
  * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
  * be called concurrently, from open context, with the dp_config_rwlock held
  * with RW_READER.
  *
  * Therefore, we only need to provide locking between dsl_deadlist_insert() and
  * the accessors, protecting:
  *     dl_phys->dl_used,comp,uncomp
  *     and protecting the dl_tree from being loaded.
  * The locking is provided by dl_lock.  Note that locking on the bpobj_t
  * provides its own locking, and dl_oldfmt is immutable.
  */
 
 /*
  * AVL comparator for deadlist entries: orders them by dle_mintxg and
  * returns -1, 0 or +1.
  */
 static int
 dsl_deadlist_compare(const void *arg1, const void *arg2)
 {
 	const dsl_deadlist_entry_t *lhs = arg1;
 	const dsl_deadlist_entry_t *rhs = arg2;
 
 	if (lhs->dle_mintxg == rhs->dle_mintxg)
 		return (0);
 
 	return (lhs->dle_mintxg < rhs->dle_mintxg ? -1 : +1);
 }
 
 /*
  * Build dl_tree from the deadlist's ZAP object on first use.  Each ZAP
  * entry becomes one tree node: the entry name is parsed as the mintxg key
  * and the entry's first integer is opened as that node's bpobj.  Does
  * nothing if the tree has already been loaded.
  */
 static void
 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	ASSERT(!dl->dl_oldfmt);
 	if (dl->dl_havetree)
 		return;
 
 	avl_create(&dl->dl_tree, dsl_deadlist_compare,
 	    sizeof (dsl_deadlist_entry_t),
 	    offsetof(dsl_deadlist_entry_t, dle_node));
 
 	zap_cursor_init(&cursor, dl->dl_os, dl->dl_object);
 	while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 		dsl_deadlist_entry_t *node;
 
 		node = kmem_alloc(sizeof (*node), KM_SLEEP);
 		node->dle_mintxg = strtonum(attr.za_name, NULL);
 		VERIFY3U(0, ==, bpobj_open(&node->dle_bpobj, dl->dl_os,
 		    attr.za_first_integer));
 		avl_add(&dl->dl_tree, node);
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	dl->dl_havetree = B_TRUE;
 }
 
 /*
  * Initialize 'dl' to refer to deadlist object 'object' in 'os'.
  *
  * dl_lock is initialized in every case.  If the object is of type
  * DMU_OT_BPOBJ the deadlist is marked old-format, the bonus buffer is
  * released again, and the object is opened as a plain bpobj.  Otherwise
  * the bonus buffer stays held, dl_phys points at its data, and the AVL
  * tree is left to be loaded later.
  */
 void
 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
 {
 	dmu_object_info_t info;
 
 	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
 	dl->dl_os = os;
 	dl->dl_object = object;
 	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
 	dmu_object_info_from_db(dl->dl_dbuf, &info);
 
 	if (info.doi_type != DMU_OT_BPOBJ) {
 		dl->dl_oldfmt = B_FALSE;
 		dl->dl_phys = dl->dl_dbuf->db_data;
 		dl->dl_havetree = B_FALSE;
 	} else {
 		dmu_buf_rele(dl->dl_dbuf, dl);
 		dl->dl_dbuf = NULL;
 		dl->dl_oldfmt = B_TRUE;
 		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
 	}
 }
 
 /*
  * Release everything dsl_deadlist_open() (and any later tree load) set up
  * for 'dl'.
  *
  * Old-format deadlists close their bpobj; new-format deadlists free every
  * loaded tree node (closing its bpobj), destroy the tree, and release the
  * bonus buffer.  dl_lock is destroyed in both cases, because
  * dsl_deadlist_open() initializes it before it knows the format.
  */
 void
 dsl_deadlist_close(dsl_deadlist_t *dl)
 {
 	void *cookie = NULL;
 	dsl_deadlist_entry_t *dle;
 
 	dl->dl_os = NULL;
 
 	if (dl->dl_oldfmt) {
 		dl->dl_oldfmt = B_FALSE;
 		bpobj_close(&dl->dl_bpobj);
 		/* Pairs with the unconditional mutex_init() in open. */
 		mutex_destroy(&dl->dl_lock);
 		return;
 	}
 
 	if (dl->dl_havetree) {
 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
 		    != NULL) {
 			bpobj_close(&dle->dle_bpobj);
 			kmem_free(dle, sizeof (*dle));
 		}
 		avl_destroy(&dl->dl_tree);
 	}
 	dmu_buf_rele(dl->dl_dbuf, dl);
 	mutex_destroy(&dl->dl_lock);
 	dl->dl_dbuf = NULL;
 	dl->dl_phys = NULL;
 }
 
 /*
  * Allocate a new deadlist object in 'os' and return its object number.
  * Pools at SPA_VERSION_DEADLISTS or later get a ZAP-based deadlist with a
  * dsl_deadlist_phys_t bonus; older pools get a plain bpobj.
  */
 uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
 	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_DEADLISTS) {
 		return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
 		    sizeof (dsl_deadlist_phys_t), tx));
 	}
 
 	return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
 }
 
 /*
  * Free deadlist object 'dlobj' in 'os'.  An object of type DMU_OT_BPOBJ is
  * freed as a bpobj.  Otherwise each bpobj named in the ZAP is freed (or,
  * if it is the pool's dp_empty_bpobj, handed to bpobj_decr_empty()) and
  * then the ZAP object itself is freed.
  */
 void
 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 {
 	dmu_object_info_t info;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &info));
 	if (info.doi_type == DMU_OT_BPOBJ) {
 		bpobj_free(os, dlobj, tx);
 		return;
 	}
 
 	zap_cursor_init(&cursor, os, dlobj);
 	while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 		uint64_t sub = attr.za_first_integer;
 
 		/* dp_empty_bpobj is re-read on every pass, as before. */
 		if (sub != dmu_objset_pool(os)->dp_empty_bpobj)
 			bpobj_free(os, sub, tx);
 		else
 			bpobj_decr_empty(os, tx);
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
 /*
  * Queue block pointer 'bp' on deadlist entry 'dle'.
  *
  * If the entry currently refers to the pool's dp_empty_bpobj, it is first
  * given a bpobj of its own: a new bpobj is allocated, the entry's handle is
  * switched over to it, bpobj_decr_empty() is called, and the deadlist ZAP
  * is updated to map the entry's mintxg to the new object.
  */
 static void
 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     const blkptr_t *bp, dmu_tx_t *tx)
 {
 	if (dle->dle_bpobj.bpo_object ==
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
 	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
 }
 
 /*
  * Attach bpobj 'obj' to deadlist entry 'dle'.  If the entry already has a
  * bpobj of its own, 'obj' is queued on it as a sub-object.  If the entry
  * still refers to the pool's dp_empty_bpobj, 'obj' simply takes its place:
  * the handle is reopened on 'obj', bpobj_decr_empty() is called, and the
  * deadlist ZAP is updated to map the entry's mintxg to 'obj'.
  */
 static void
 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     uint64_t obj, dmu_tx_t *tx)
 {
 	if (dle->dle_bpobj.bpo_object ==
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 		return;
 	}
 
 	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
 }
 
 /*
  * Add block pointer 'bp' to deadlist 'dl'.
  *
  * An old-format deadlist queues bp directly on its single bpobj.  A
  * new-format deadlist adds bp's sizes to the dl_phys totals under dl_lock
  * and then queues bp on the entry with the largest mintxg that is strictly
  * less than bp->blk_birth.
  *
  * NOTE(review): the tree is loaded here before dl_lock is taken, whereas
  * dsl_deadlist_space_range() loads it with dl_lock held - confirm that
  * callers serialize the two.
  */
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
 		return;
 	}
 
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	mutex_enter(&dl->dl_lock);
 	dl->dl_phys->dl_used +=
 	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
 	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
 	mutex_exit(&dl->dl_lock);
 
 	/*
 	 * Keys are exclusive lower bounds: on an exact match step back to
 	 * the previous entry, otherwise take the nearest entry before the
 	 * insertion point.
 	 */
 	dle_tofind.dle_mintxg = bp->blk_birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	else
 		dle = AVL_PREV(&dl->dl_tree, dle);
 	dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
  * Insert new key in deadlist, which must be > all current entries.
  * mintxg is not inclusive.
  *
  * The new entry starts out backed by the object returned from
  * bpobj_alloc_empty() and is recorded both in the in-core tree and in the
  * deadlist ZAP.  Does nothing for an old-format deadlist.
  */
 void
 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	uint64_t obj;
 	dsl_deadlist_entry_t *dle;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	dsl_deadlist_load_tree(dl);
 
 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 	dle->dle_mintxg = mintxg;
 	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
 	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
 	    mintxg, obj, tx));
 }
 
 /*
  * Remove this key, merging its entries into the previous key.
  *
  * The removed entry's bpobj is handed to the previous entry through
  * dle_enqueue_subobj(); the entry is then dropped from the tree, closed,
  * freed, and removed from the deadlist ZAP.  Does nothing for an
  * old-format deadlist.
  *
  * NOTE(review): neither the avl_find() result nor the AVL_PREV() result is
  * checked for NULL; presumably callers guarantee that mintxg is an existing
  * key and not the first one - confirm.
  */
 void
 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle, *dle_prev;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
 	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
 	avl_remove(&dl->dl_tree, dle);
 	bpobj_close(&dle->dle_bpobj);
 	kmem_free(dle, sizeof (*dle));
 
 	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
 }
 
 /*
  * Walk ds's snapshots to regenerate the ZAP & AVL.
  *
  * Opens deadlist 'dlobj' and, starting at dataset 'mrs_obj' and following
  * ds_prev_snap_obj until it reaches 0, adds one key per dataset using that
  * dataset's ds_prev_snap_txg.  Does nothing beyond open/close if the
  * deadlist is old-format.
  */
 static void
 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_t dl;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	dsl_deadlist_open(&dl, os, dlobj);
 	if (dl.dl_oldfmt) {
 		dsl_deadlist_close(&dl);
 		return;
 	}
 
 	while (mrs_obj != 0) {
 		dsl_dataset_t *ds;
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
 		dsl_deadlist_add_key(&dl,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_deadlist_close(&dl);
 }
 
 /*
  * Allocate a new deadlist that has the same keys as 'dl' below 'maxtxg',
  * each backed by an object from bpobj_alloc_empty(), and return the new
  * object number.  If 'dl' is old-format, the keys of the new deadlist are
  * instead regenerated from the snapshot chain starting at 'mrs_obj'.
  */
 uint64_t
 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t *ent;
 	uint64_t clone_obj = dsl_deadlist_alloc(dl->dl_os, tx);
 
 	if (dl->dl_oldfmt) {
 		dsl_deadlist_regenerate(dl->dl_os, clone_obj, mrs_obj, tx);
 		return (clone_obj);
 	}
 
 	dsl_deadlist_load_tree(dl);
 
 	/* Entries are visited in key order, so stop at the first >= maxtxg. */
 	ent = avl_first(&dl->dl_tree);
 	while (ent != NULL && ent->dle_mintxg < maxtxg) {
 		uint64_t empty;
 
 		empty = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, clone_obj,
 		    ent->dle_mintxg, empty, tx));
 
 		ent = AVL_NEXT(&dl->dl_tree, ent);
 	}
 
 	return (clone_obj);
 }
 
 /*
  * Report the total used, compressed and uncompressed space on deadlist
  * 'dl'.  A new-format deadlist answers from the dl_phys totals under
  * dl_lock; an old-format deadlist asks its bpobj.
  */
 void
 dsl_deadlist_space(dsl_deadlist_t *dl,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	if (!dl->dl_oldfmt) {
 		mutex_enter(&dl->dl_lock);
 		*usedp = dl->dl_phys->dl_used;
 		*compp = dl->dl_phys->dl_comp;
 		*uncompp = dl->dl_phys->dl_uncomp;
 		mutex_exit(&dl->dl_lock);
 		return;
 	}
 
 	VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
 	    usedp, compp, uncompp));
 }
 
 /*
  * Return space used in the range (mintxg, maxtxg].
  * Includes maxtxg, does not include mintxg.
  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
  * larger than any bp in the deadlist (eg. UINT64_MAX)).
  *
  * An old-format deadlist delegates to bpobj_space_range().  Otherwise the
  * outputs are zeroed and the space of every entry whose key is at least
  * mintxg and below maxtxg is summed, with dl_lock held across the tree
  * load and the walk.
  */
 void
 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	dsl_deadlist_entry_t *dle;
 	dsl_deadlist_entry_t dle_tofind;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
 		    mintxg, maxtxg, usedp, compp, uncompp));
 		return;
 	}
 
 	*usedp = *compp = *uncompp = 0;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	/*
 	 * If we don't find this mintxg, there shouldn't be anything
 	 * after it either.
 	 */
 	ASSERT(dle != NULL ||
 	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
 
 	/* Walk forward from the mintxg entry, stopping at the maxtxg key. */
 	for (; dle && dle->dle_mintxg < maxtxg;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		uint64_t used, comp, uncomp;
 
 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
 
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 	}
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * Attach the whole bpobj 'obj' to deadlist 'dl' as a sub-object.
  *
  * The bpobj's space totals are read and added to the dl_phys totals under
  * dl_lock.  The bpobj is then attached to the entry whose key equals
  * 'birth' or, when there is no such key, to the nearest entry before it.
  */
 static void
 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 	uint64_t used, comp, uncomp;
 	bpobj_t bpo;
 
 	/* Open the bpobj only long enough to read its totals. */
 	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
 	bpobj_close(&bpo);
 
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	mutex_enter(&dl->dl_lock);
 	dl->dl_phys->dl_used += used;
 	dl->dl_phys->dl_comp += comp;
 	dl->dl_phys->dl_uncomp += uncomp;
 	mutex_exit(&dl->dl_lock);
 
 	dle_tofind.dle_mintxg = birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 /*
  * bpobj_iterate() callback: inserts 'bp' into the deadlist passed as 'arg'
  * and always returns 0.
  */
 static int
 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *target = arg;
 
 	dsl_deadlist_insert(target, bp, tx);
 
 	return (0);
 }
 
 /*
  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
  * an empty deadlist.
  *
  * If 'obj' is of type DMU_OT_BPOBJ, every block pointer in it is inserted
  * into dl one at a time and the function returns.  Otherwise each
  * (mintxg, bpobj) ZAP entry of 'obj' is attached to dl as a sub-object and
  * removed from 'obj', after which obj's bonus header is zeroed.
  */
 void
 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	dmu_buf_t *bonus;
 	dsl_deadlist_phys_t *dlp;
 	dmu_object_info_t doi;
 
 	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		bpobj_t bpo;
 		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 		VERIFY3U(0, ==, bpobj_iterate(&bpo,
 		    dsl_deadlist_insert_cb, dl, tx));
 		bpobj_close(&bpo);
 		return;
 	}
 
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		/* The ZAP entry name is the key in string form. */
 		uint64_t mintxg = strtonum(za.za_name, NULL);
 		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
 	}
 	zap_cursor_fini(&zc);
 
 	/* Clear obj's header totals now that its entries have moved. */
 	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
 	dlp = bonus->db_data;
 	dmu_buf_will_dirty(bonus, tx);
 	bzero(dlp, sizeof (*dlp));
 	dmu_buf_rele(bonus, FTAG);
 }
 
 /*
  * Remove entries on dl that are >= mintxg, and put them on the bpobj.
  *
  * Starting at the entry whose key equals mintxg (or the nearest entry
  * after it), each entry's bpobj is queued on 'bpo' as a sub-object, its
  * space is subtracted from the dl_phys totals under dl_lock, and the entry
  * is removed from the ZAP and the tree and freed.  Only valid for
  * new-format deadlists.
  */
 void
 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	ASSERT(!dl->dl_oldfmt);
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
 
 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
 		mutex_enter(&dl->dl_lock);
 		ASSERT3U(dl->dl_phys->dl_used, >=, used);
 		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
 		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
 		dl->dl_phys->dl_used -= used;
 		dl->dl_phys->dl_comp -= comp;
 		dl->dl_phys->dl_uncomp -= uncomp;
 		mutex_exit(&dl->dl_lock);
 
 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, tx));
 
 		/* Fetch the successor before this node is unlinked and freed. */
 		dle_next = AVL_NEXT(&dl->dl_tree, dle);
 		avl_remove(&dl->dl_tree, dle);
 		bpobj_close(&dle->dle_bpobj);
 		kmem_free(dle, sizeof (*dle));
 		dle = dle_next;
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	(revision 296519)
@@ -1,985 +1,986 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
 
 /*
  * Argument bundle shared by dsl_destroy_snapshot_check() and
  * dsl_destroy_snapshot_sync().
  */
 typedef struct dmu_snapshots_destroy_arg {
 	/* Names of the snapshots the caller asked to destroy. */
 	nvlist_t *dsda_snaps;
 	/* Filled in by the check function: names that passed the check. */
 	nvlist_t *dsda_successful_snaps;
 	/* Passed through as the "defer" argument of the *_impl() routines. */
 	boolean_t dsda_defer;
 	/* Filled in by the check function: snapshot name -> int32 error. */
 	nvlist_t *dsda_errlist;
 } dmu_snapshots_destroy_arg_t;
 
 /*
  * Decide whether the snapshot ds may be destroyed (or, when defer is set,
  * marked for deferred destruction).
  *
  * Returns 0 when it may, otherwise:
  *	EINVAL	ds is not a snapshot
  *	EBUSY	ds is long-held, or (non-deferred) has user references
  *	ENOTSUP	deferred destroy requested on a pool older than
  *		SPA_VERSION_USERREFS
  *	EEXIST	(non-deferred) ds is a branch point
  */
 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {
 	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 	if (dsl_dataset_long_held(ds))
 		return (SET_ERROR(EBUSY));
 
 	if (!defer) {
 		/*
 		 * An immediate destroy is refused while user references
 		 * are outstanding ...
 		 */
 		if (ds->ds_userrefs > 0)
 			return (SET_ERROR(EBUSY));
 
 		/* ... and while the snapshot is a branch point. */
 		if (dsl_dataset_phys(ds)->ds_num_children > 1)
 			return (SET_ERROR(EEXIST));
 
 		return (0);
 	}
 
 	/*
 	 * Deferred destroy (snapshots only) merely requires a pool new
 	 * enough to support it.
 	 */
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
 		return (SET_ERROR(ENOTSUP));
 
 	return (0);
 }
 
 /*
  * Sync-task check function for destroying a list of snapshots.
  *
  * Outside syncing context this is a no-op returning 0.  In syncing
  * context each requested name is sorted into dsda_successful_snaps or
  * dsda_errlist; names that do not exist are skipped.  Returns the first
  * error recorded in dsda_errlist, or 0 if the list is empty.
  */
 static int
 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_snapshots_destroy_arg_t *args = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	nvpair_t *elem;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(args->dsda_snaps, elem)) != NULL) {
 		const char *snapname = nvpair_name(elem);
 		dsl_dataset_t *ds;
 		int err;
 
 		err = dsl_dataset_hold(pool, snapname, FTAG, &ds);
 
 		/* A missing snapshot counts as "already destroyed". */
 		if (err == ENOENT)
 			continue;
 
 		if (err == 0) {
 			err = dsl_destroy_snapshot_check_impl(ds,
 			    args->dsda_defer);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (err != 0) {
 			fnvlist_add_int32(args->dsda_errlist, snapname, err);
 		} else {
 			fnvlist_add_boolean(args->dsda_successful_snaps,
 			    snapname);
 		}
 	}
 
 	/* Report the first recorded failure, if there is one. */
 	elem = nvlist_next_nvpair(args->dsda_errlist, NULL);
 	return (elem != NULL ? fnvpair_value_int32(elem) : 0);
 }
 
 /*
  * State handed to process_old_cb() through bpobj_iterate() by
  * process_old_deadlist().
  */
 struct process_old_arg {
 	dsl_dataset_t *ds;		/* dataset whose deadlist gains blocks */
 	dsl_dataset_t *ds_prev;		/* previous snapshot; may be NULL */
 	boolean_t after_branch_point;	/* suppresses ds_prev unique update */
 	zio_t *pio;			/* parent zio passed to dsl_free_sync() */
 	uint64_t used, comp, uncomp;	/* running totals of freed space */
 };
 
 /*
  * bpobj_iterate() callback used by process_old_deadlist().
  *
  * A block born after poa->ds's previous snapshot is freed and tallied in
  * poa->used/comp/uncomp.  Any older block is put on poa->ds's deadlist
  * and, when it was born after ds_prev's own previous snapshot and we are
  * not after a branch point, added to ds_prev's unique bytes.
  * Always returns 0.
  */
 static int
 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	struct process_old_arg *poa = arg;
 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (bp->blk_birth > dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		/* Born after the previous snapshot: free it and tally. */
 		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
 		poa->comp += BP_GET_PSIZE(bp);
 		poa->uncomp += BP_GET_UCSIZE(bp);
 		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
 		return (0);
 	}
 
 	dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
 
 	if (poa->ds_prev == NULL || poa->after_branch_point)
 		return (0);
 
 	if (bp->blk_birth > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
 		dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 		    bp_get_dsize_sync(dp->dp_spa, bp);
 	}
 
 	return (0);
 }
 
 /*
  * Old-format deadlist handling for the destruction of ds: run
  * process_old_cb() over every block on ds_next's deadlist, charge the
  * freed space against the dir's snapshot usage, and then exchange the
  * deadlist objects of ds and ds_next.
  */
 static void
 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
 {
 	struct process_old_arg poa = { 0 };
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ours, theirs;
 
 	ASSERT(ds->ds_deadlist.dl_oldfmt);
 	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
 
 	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	poa.after_branch_point = after_branch_point;
 	poa.ds_prev = ds_prev;
 	poa.ds = ds;
 
 	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 	    process_old_cb, &poa, tx));
 	VERIFY0(zio_wait(poa.pio));
 	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	/* Everything just freed no longer counts as snapshot-used space. */
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 	    -poa.used, -poa.comp, -poa.uncomp, tx);
 
 	/*
 	 * Exchange the two deadlist objects: close both, swap the object
 	 * numbers recorded in the datasets, and reopen each on its new
 	 * object.
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_close(&ds_next->ds_deadlist);
 
 	ours = dsl_dataset_phys(ds)->ds_deadlist_obj;
 	theirs = dsl_dataset_phys(ds_next)->ds_deadlist_obj;
 	dsl_dataset_phys(ds)->ds_deadlist_obj = theirs;
 	dsl_dataset_phys(ds_next)->ds_deadlist_obj = ours;
 
 	dsl_deadlist_open(&ds->ds_deadlist, mos, theirs);
 	dsl_deadlist_open(&ds_next->ds_deadlist, mos, ours);
 }
 
 /*
  * Remove the deadlist key mintxg from every clone listed in ds's
  * dd_clones whose origin txg is later than mintxg, recursing into the
  * clones of each such clone.
  */
 static void
 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t clones_obj = dsl_dir_phys(ds->ds_dir)->dd_clones;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	/*
 	 * Old pool versions have no dd_clones, so the clones cannot be
 	 * enumerated; dsl_deadlist_remove_key() is a no-op there anyway,
 	 * so nothing is lost.
 	 */
 	if (clones_obj == 0)
 		return;
 
 	zap_cursor_init(&zc, mos, clones_obj);
 	while (zap_cursor_retrieve(&zc, &za) == 0) {
 		dsl_dataset_t *clone;
 
 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		if (clone->ds_dir->dd_origin_txg > mintxg) {
 			dsl_deadlist_remove_key(&clone->ds_deadlist,
 			    mintxg, tx);
 			dsl_dataset_remove_clones_key(clone, mintxg, tx);
 		}
 		dsl_dataset_rele(clone, FTAG);
 
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Destroy (or mark for deferred destruction) the snapshot ds.  Must be
  * called with the pool config lock held for writing and with no long
  * holds on ds.
  *
  * If defer is set and ds still has user references or is a branch point,
  * only DS_FLAG_DEFER_DESTROY is set and the function returns.
  *
  * Otherwise the snapshot is unlinked from its neighbours, deadlists and
  * space accounting of the previous/next datasets are adjusted, the name
  * is removed from the head dataset's snapshot namespace, and the
  * dataset's auxiliary objects and the dataset object itself are freed.
  * On return ds->ds_dir has been released and set to NULL; the caller's
  * own hold on ds is left for the caller to release.
  */
 void
 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 {
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	uint64_t obj;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
 	ASSERT(refcount_is_zero(&ds->ds_longholds));
 
 	if (defer &&
 	    (ds->ds_userrefs > 0 ||
 	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
 		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
 		return;
 	}
 
 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 
 	/* We need to log before removing it from the namespace. */
 	spa_history_log_internal_ds(ds, "destroy", tx, "");
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	obj = ds->ds_object;
 
 	/* Drop this dataset's reference on every feature it was using. */
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (ds->ds_feature_inuse[f]) {
 			dsl_dataset_deactivate_feature(obj, f, tx);
 			ds->ds_feature_inuse[f] = B_FALSE;
 		}
 	}
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		ASSERT3P(ds->ds_prev, ==, NULL);
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
 		/*
 		 * We are "after a branch point" when the previous snapshot's
 		 * next-snapshot link does not point at us.
 		 */
 		after_branch_point =
 		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
 			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 				VERIFY0(zap_add_int(mos,
 				    dsl_dataset_phys(ds_prev)->
 				    ds_next_clones_obj,
 				    dsl_dataset_phys(ds)->ds_next_snap_obj,
 				    tx));
 			}
 		}
 		if (!after_branch_point) {
 			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
 			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 		}
 	}
 
 	dsl_dataset_t *ds_next;
 	uint64_t old_unique;
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
 	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
 
 	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 	/* Make the next dataset point past us at our previous snapshot. */
 	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
 	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
 
 	if (ds_next->ds_deadlist.dl_oldfmt) {
 		process_old_deadlist(ds, ds_prev, ds_next,
 		    after_branch_point, tx);
 	} else {
 		/* Adjust prev's unique space. */
 		if (ds_prev && !after_branch_point) {
 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
 			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
 			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 			    &used, &comp, &uncomp);
 			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
 		}
 
 		/* Adjust snapused. */
 		dsl_deadlist_space_range(&ds_next->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 		    -used, -comp, -uncomp, tx);
 
 		/* Move blocks to be freed to pool's free list. */
 		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
 		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    tx);
 		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 		    DD_USED_HEAD, used, comp, uncomp, tx);
 
 		/* Merge our deadlist into next's and free it. */
 		dsl_deadlist_merge(&ds_next->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	}
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	/* Collapse range in clone heads */
 	dsl_dataset_remove_clones_key(ds,
 	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 
 	if (ds_next->ds_is_snapshot) {
 		dsl_dataset_t *ds_nextnext;
 
 		/*
 		 * Update next's unique to include blocks which
 		 * were previously shared by only this snapshot
 		 * and it.  Those blocks will be born after the
 		 * prev snap and before this snap, and will have
 		 * died after the next snap and before the one
 		 * after that (ie. be on the snap after next's
 		 * deadlist).
 		 */
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
 		    FTAG, &ds_nextnext));
 		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    dsl_dataset_phys(ds)->ds_creation_txg,
 		    &used, &comp, &uncomp);
 		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
 		dsl_dataset_rele(ds_nextnext, FTAG);
 		ASSERT3P(ds_next->ds_prev, ==, NULL);
 
 		/* Collapse range in this head. */
 		dsl_dataset_t *hds;
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
 		dsl_deadlist_remove_key(&hds->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 		dsl_dataset_rele(hds, FTAG);
 
 	} else {
 		/*
 		 * The next dataset is not a snapshot and holds us as its
 		 * ds_prev; swap that hold over to our previous snapshot,
 		 * if we have one.
 		 */
 		ASSERT3P(ds_next->ds_prev, ==, ds);
 		dsl_dataset_rele(ds_next->ds_prev, ds_next);
 		ds_next->ds_prev = NULL;
 		if (ds_prev) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds_next, &ds_next->ds_prev));
 		}
 
 		dsl_dataset_recalc_head_uniq(ds_next);
 
 		/*
 		 * Reduce the amount of our unconsumed refreservation
 		 * being charged to our parent by the amount of
 		 * new unique data we have gained.
 		 */
 		if (old_unique < ds_next->ds_reserved) {
 			int64_t mrsdelta;
 			uint64_t new_unique =
 			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 			ASSERT(old_unique <= new_unique);
 			mrsdelta = MIN(new_unique - old_unique,
 			    ds_next->ds_reserved - old_unique);
 			dsl_dir_diduse_space(ds->ds_dir,
 			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
 		}
 	}
 	dsl_dataset_rele(ds_next, FTAG);
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* remove from snapshot namespace */
 	dsl_dataset_t *ds_head;
 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
 	VERIFY0(dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 	{
 		uint64_t val;
 
 		err = dsl_dataset_snap_lookup(ds_head,
 		    ds->ds_snapname, &val);
 		ASSERT0(err);
 		ASSERT3U(val, ==, obj);
 	}
 #endif
 	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
 	dsl_dataset_rele(ds_head, FTAG);
 
 	if (ds_prev != NULL)
 		dsl_dataset_rele(ds_prev, FTAG);
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
 	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 		uint64_t count;
 		/*
 		 * Both conditions must hold: the count lookup succeeds
 		 * and the clone list is empty.  (Wrapping the whole
 		 * conjunction in ASSERT0() would pass whenever zap_count()
 		 * succeeded, without ever looking at count.)
 		 */
 		ASSERT(zap_count(mos,
 		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) == 0 &&
 		    count == 0);
 		VERIFY0(dmu_object_free(mos,
 		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
 	}
 	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
 		    tx));
 	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
 		    tx));
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 }
 
 /*
  * Sync-task sync function: destroy every snapshot that the check
  * function recorded in dsda_successful_snaps.
  */
 static void
 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_snapshots_destroy_arg_t *args = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair = NULL;
 
 	while ((pair = nvlist_next_nvpair(args->dsda_successful_snaps,
 	    pair)) != NULL) {
 		dsl_dataset_t *ds;
 
 		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
 
 		dsl_destroy_snapshot_sync_impl(ds, args->dsda_defer, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * Destroy the snapshots named in snaps.  The full semantics are given in
  * the comment above lzc_destroy_snaps(); in short:
  *
  *  - all of the snapshots must live in the same pool;
  *  - a snapshot that does not exist is silently skipped, as if it had
  *    already been deleted;
  *  - on success every snapshot is destroyed and 0 is returned;
  *  - on failure nothing is destroyed, errlist is filled in, and an
  *    errno is returned.
  *
  * An empty snaps list is a successful no-op.
  */
 int
 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
     nvlist_t *errlist)
 {
 	dmu_snapshots_destroy_arg_t args;
 	nvpair_t *first = nvlist_next_nvpair(snaps, NULL);
 	int rv;
 
 	if (first == NULL)
 		return (0);
 
 	args.dsda_errlist = errlist;
 	args.dsda_defer = defer;
 	args.dsda_snaps = snaps;
 	args.dsda_successful_snaps = fnvlist_alloc();
 
 	/* The first name is what identifies the pool to the sync task. */
 	rv = dsl_sync_task(nvpair_name(first),
 	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
 	    &args, 0, ZFS_SPACE_CHECK_NONE);
 
 	fnvlist_free(args.dsda_successful_snaps);
 	return (rv);
 }
 
 /*
  * Single-snapshot convenience wrapper around dsl_destroy_snapshots_nvl().
  * The per-snapshot error list is discarded; only the overall error is
  * returned.
  */
 int
 dsl_destroy_snapshot(const char *name, boolean_t defer)
 {
 	nvlist_t *snaps, *errors;
 	int rv;
 
 	snaps = fnvlist_alloc();
 	errors = fnvlist_alloc();
 	fnvlist_add_boolean(snaps, name);
 
 	rv = dsl_destroy_snapshots_nvl(snaps, defer, errors);
 
 	fnvlist_free(errors);
 	fnvlist_free(snaps);
 	return (rv);
 }
 
 /* Callback state passed to kill_blkptr() via traverse_dataset(). */
 struct killarg {
 	dsl_dataset_t *ds;	/* dataset whose blocks are being killed */
 	dmu_tx_t *tx;		/* transaction the frees are charged to */
 };
 
 /*
  * traverse_dataset() callback used by old_synchronous_dataset_destroy().
  * NULL, hole and embedded block pointers are skipped.  Intent-log blocks
  * are freed directly; every other block goes through
  * dsl_dataset_block_kill().  Always returns 0.
  */
 /* ARGSUSED */
 static int
 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level != ZB_ZIL_LEVEL) {
 		ASSERT(zilog == NULL);
 		ASSERT3U(bp->blk_birth, >,
 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 		return (0);
 	}
 
 	/*
 	 * A block in the intent log carries no accounting, so it is
 	 * simply freed.
 	 */
 	ASSERT(zilog != NULL);
 	dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 	return (0);
 }
 
 /*
  * Synchronously free every block ds points to that was born after its
  * previous snapshot (which matters when ds is a clone).
  *
  * NB: this is expected to be quick, since all of the objects were
  * already freed in open context.
  */
 static void
 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	struct killarg ka = { .ds = ds, .tx = tx };
 
 	VERIFY0(traverse_dataset(ds,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
 	    kill_blkptr, &ka));
 
 	/* With accurate accounting, nothing unique may be left over. */
 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
 	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
 }
 
 /* Argument passed to the dsl_destroy_head_*() sync-task callbacks. */
 typedef struct dsl_destroy_head_arg {
 	const char *ddha_name;	/* name of the head dataset to destroy */
 } dsl_destroy_head_arg_t;
 
 /*
  * Decide whether the head dataset ds may be destroyed, given that the
  * caller expects exactly expected_holds long holds on it.
  *
  * Returns 0 when it may, otherwise:
  *	EINVAL	ds is a snapshot
  *	EBUSY	unexpected long holds, ds has snapshots of its own, or the
  *		origin snapshot that would be removed too is long-held
  *	EEXIST	ds's dir has child dirs
  * or whatever error zap_count() reported.
  */
 int
 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
 {
 	objset_t *mos;
 	uint64_t count;
 	int error;
 
 	ASSERT(!ds->ds_is_snapshot);
 	if (ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (refcount_count(&ds->ds_longholds) != expected_holds)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * A head with snapshots cannot be deleted, unless the only
 	 * snapshots belong to the branch it was cloned from.
 	 */
 	if (ds->ds_prev != NULL &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
 		return (SET_ERROR(EBUSY));
 
 	/* Nor can a filesystem that still has children. */
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	error = zap_count(mos,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
 	if (error != 0)
 		return (error);
 	if (count != 0)
 		return (SET_ERROR(EEXIST));
 
 	/*
 	 * When the origin snapshot has to be removed along with this
 	 * clone, it must not be long-held either.
 	 */
 	if (dsl_dir_is_clone(ds->ds_dir) &&
 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0 &&
 	    !refcount_is_zero(&ds->ds_prev->ds_longholds))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Sync-task check function for dsl_destroy_head(): look the dataset up
  * by name and run dsl_destroy_head_check_impl() on it, expecting no
  * long holds.
  */
 static int
 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *args = arg;
 	dsl_dataset_t *ds;
 	int rv;
 
 	rv = dsl_dataset_hold(dmu_tx_pool(tx), args->ddha_name, FTAG, &ds);
 	if (rv == 0) {
 		rv = dsl_destroy_head_check_impl(ds, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	return (rv);
 }
 
 /*
  * Destroy the dsl_dir object ddobj.  The pool config lock must be held
  * for writing, and the dir must no longer have a head dataset.
  *
  * The dir's reservation is dropped, its child-dir, props and delegation
  * ZAP objects are destroyed, its entry is removed from the parent's
  * child-dir ZAP, and finally the dir object itself is freed.
  */
 static void
 dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dd_used_t t;
 
 	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
 
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 
 	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	/*
 	 * Decrement the filesystem count for all parent filesystems.
 	 *
 	 * When we receive an incremental stream into a filesystem that already
 	 * exists, a temporary clone is created.  We never count this temporary
 	 * clone, whose name begins with a '%'.
 	 */
 	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
 		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 
 	/*
 	 * Remove our reservation. The impl() routine avoids setting the
 	 * actual property, which would require the (already destroyed) ds.
 	 */
 	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 
 	/* By now the dir must account for no space at all. */
 	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
 	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
 	for (t = 0; t < DD_USED_NUM; t++)
 		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
 
 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
 	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
 	/* Unlink this dir's name from its parent's list of children. */
 	VERIFY0(zap_remove(mos,
 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx));
 
 	dsl_dir_rele(dd, FTAG);
 	dmu_object_free_zapified(mos, ddobj, tx);
 }
 
 /*
  * Destroy the head dataset ds.  The pool config lock must be held for
  * writing; ds must have at most one child and must not have snapshots
  * of its own (both asserted).
  *
  * The dataset's blocks are either freed synchronously (pools without the
  * async_destroy feature enabled) or handed to the pool's bptree for
  * background freeing.  Afterwards the dataset object and its dsl_dir are
  * destroyed.  If ds was the only clone of a snapshot that is marked for
  * deferred destruction and has no user references, that origin snapshot
  * is destroyed as well.
  *
  * On return ds->ds_prev and ds->ds_dir have been released and set to
  * NULL; the caller's own hold on ds is left for the caller to release.
  */
 void
 dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t obj, ddobj, prevobj = 0;
 	boolean_t rmorigin;
 
 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/* We need to log before removing it from the namespace. */
 	spa_history_log_internal_ds(ds, "destroy", tx, "");
 
 	/*
 	 * Decide up front whether the origin snapshot must go too; this is
 	 * evaluated before ds_prev is released further down.
 	 */
 	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0);
 
 	/* Remove our reservation. */
 	if (ds->ds_reserved != 0) {
 		dsl_dataset_set_refreservation_sync_impl(ds,
 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
 		    0, tx);
 		ASSERT0(ds->ds_reserved);
 	}
 
 	obj = ds->ds_object;
 
 	/* Drop this dataset's reference on every feature it was using. */
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (ds->ds_feature_inuse[f]) {
 			dsl_dataset_deactivate_feature(obj, f, tx);
 			ds->ds_feature_inuse[f] = B_FALSE;
 		}
 	}
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		/* This is a clone */
 		ASSERT(ds->ds_prev != NULL);
 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
 		    obj);
 		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
 
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    obj, tx);
 		}
 
 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
 		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
 	}
 
 	/*
 	 * Destroy the deadlist.  Unless it's a clone, the
 	 * deadlist should be empty.  (If it's a clone, it's
 	 * safe to ignore the deadlist contents.)
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	objset_t *os;
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		old_synchronous_dataset_destroy(ds, tx);
 	} else {
 		/*
 		 * Move the bptree into the pool's list of trees to
 		 * clean up and update space accounting information.
 		 */
 		uint64_t used, comp, uncomp;
 
 		zil_destroy_sync(dmu_objset_zil(os), tx);
 
 		/*
 		 * First use of async destroy on this pool: activate the
 		 * feature and create the pool-wide bptree object.
 		 */
 		if (!spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_ASYNC_DESTROY)) {
 			dsl_scan_t *scn = dp->dp_scan;
 			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
 			    tx);
 			dp->dp_bptree_obj = bptree_alloc(mos, tx);
 			VERIFY0(zap_add(mos,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 			    &dp->dp_bptree_obj, tx));
 			ASSERT(!scn->scn_async_destroying);
 			scn->scn_async_destroying = B_TRUE;
 		}
 
 		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
 		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
 		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
 
 		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
 		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
 
 		/*
 		 * Queue the dataset's root block pointer on the bptree and
 		 * transfer its space from this dir to the pool's free dir.
 		 */
 		bptree_add(mos, dp->dp_bptree_obj,
 		    &dsl_dataset_phys(ds)->ds_bp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    used, comp, uncomp, tx);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    -used, -comp, -uncomp, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 		    used, comp, uncomp, tx);
 	}
 
 	if (ds->ds_prev != NULL) {
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			VERIFY0(zap_remove_int(mos,
 			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
 			    ds->ds_object, tx));
 		}
 		/* Remember the origin's object number for rmorigin below. */
 		prevobj = ds->ds_prev->ds_object;
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* Erase the link in the dir */
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
 	ddobj = ds->ds_dir->dd_object;
 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
 	VERIFY0(zap_destroy(mos,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
 
 	if (ds->ds_bookmarks != 0) {
 		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 	}
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
 	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
 	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
 	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 
 	dsl_dir_destroy_sync(ddobj, tx);
 
 	/* Finally take out the deferred-destroy origin, if we must. */
 	if (rmorigin) {
 		dsl_dataset_t *prev;
 		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
 		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
 		dsl_dataset_rele(prev, FTAG);
 	}
 }
 
 /*
  * Sync-task sync function for dsl_destroy_head(): look the dataset up by
  * name and destroy it with dsl_destroy_head_sync_impl().
  */
 static void
 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 	dsl_destroy_head_sync_impl(ds, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Sync-task sync function for the first phase of a head destroy on pools
  * without async destroy: flag the dataset DS_FLAG_INCONSISTENT on disk
  * and log the start of the destroy.
  */
 static void
 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Destroy the head dataset called name.
  *
  * On pools where the async_destroy feature is not enabled, the dataset is
  * first marked inconsistent and its objects are freed from open context,
  * so that the final sync task has little left to do.  Returns 0 or an
  * errno.
  */
 int
 dsl_destroy_head(const char *name)
 {
 	dsl_destroy_head_arg_t ddha;
 	boolean_t async_destroy;
 	spa_t *spa;
 	int error;
 
 #ifdef _KERNEL
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	async_destroy = spa_feature_is_enabled(spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 	spa_close(spa, FTAG);
 
 	ddha.ddha_name = name;
 
 	if (!async_destroy) {
 		objset_t *os;
 
 		error = dsl_sync_task(name, dsl_destroy_head_check,
 		    dsl_destroy_head_begin_sync, &ddha,
 		    0, ZFS_SPACE_CHECK_NONE);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Old pools process a head deletion within a single txg,
 		 * so the objects are removed here, from open context, to
 		 * keep that txg sync from running too long.
 		 */
 		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
 		if (error == 0) {
 			uint64_t prev_snap_txg =
 			    dsl_dataset_phys(dmu_objset_ds(os))->
 			    ds_prev_snap_txg;
 			uint64_t obj = 0;
 
 			/* Walk objects until dmu_object_next() fails. */
 			do {
 				(void) dmu_free_long_object(os, obj);
 				error = dmu_object_next(os, &obj, FALSE,
 				    prev_snap_txg);
 			} while (error == 0);
 
 			/* sync out all frees */
 			txg_wait_synced(dmu_objset_pool(os), 0);
 			dmu_objset_disown(os, FTAG);
 		}
 	}
 
 	return (dsl_sync_task(name, dsl_destroy_head_check,
 	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * dmu_objset_find() callback: destroy dsname if it is marked
  * inconsistent.
  *
  * The return value is always 0, so that the search carries on and the
  * remaining inconsistent datasets still get processed even when one of
  * them cannot be.
  */
 /* ARGSUSED */
 int
 dsl_destroy_inconsistent(const char *dsname, void *arg)
 {
 	objset_t *os;
 	boolean_t inconsistent, resumable;
 
 	if (dmu_objset_hold(dsname, FTAG, &os) != 0)
 		return (0);
 
 	inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
 
 	/*
 	 * A dataset left inconsistent by a failed resumable receive must
 	 * be kept.
 	 */
 	resumable = dsl_dataset_has_resume_receive_state(dmu_objset_ds(os));
 
 	dmu_objset_rele(os, FTAG);
 
 	if (inconsistent && !resumable)
 		(void) dsl_destroy_head(dsname);
 
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 296519)
@@ -1,1171 +1,1172 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_scan.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_userhold.h>
 
 #ifdef __FreeBSD__
 #include <sys/sysctl.h>
 #include <sys/types.h>
 #endif
 
 /*
  * ZFS Write Throttle
  * ------------------
  *
  * ZFS must limit the rate of incoming writes to the rate at which it is able
  * to sync data modifications to the backend storage. Throttling by too much
  * creates an artificial limit; throttling by too little can only be sustained
  * for short periods and would lead to highly lumpy performance. On a per-pool
  * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  * of dirty data decreases. When the amount of dirty data exceeds a
  * predetermined threshold further modifications are blocked until the amount
  * of dirty data decreases (as data is synced out).
  *
  * The limit on dirty data is tunable, and should be adjusted according to
  * both the IO capacity and available memory of the system. The larger the
  * window, the more ZFS is able to aggregate and amortize metadata (and data)
  * changes. However, memory is a limited resource, and allowing for more dirty
  * data comes at the cost of keeping other useful data in memory (for example
  * ZFS data cached by the ARC).
  *
  * Implementation
  *
  * As buffers are modified dsl_pool_dirty_space() increments both the per-
  * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
  * dirty space used; dsl_pool_undirty_space() decrements those values as
  * data is synced out (e.g. from dsl_pool_sync()). While only the poolwide
  * value is relevant, the per-txg value is useful for debugging. The tunable
  * zfs_dirty_data_max determines the dirty space limit. Once that value is
  * exceeded, new writes are halted until space frees up.
  *
  * The zfs_dirty_data_sync tunable dictates the threshold at which we
  * ensure that there is a txg syncing (see the comment in txg.c for a full
  * description of transaction group stages).
  *
  * The IO scheduler uses both the dirty space limit and current amount of
  * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  *
  * The delay is also calculated based on the amount of dirty data.  See the
  * comment above dmu_tx_delay() for details.
  */
 
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
  */
 uint64_t zfs_dirty_data_max;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
 /*
  * If there is at least this much dirty data, push out a txg.
  */
 uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
  * and delay each transaction.
  * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
  */
 int zfs_delay_min_dirty_percent = 60;
 
 /*
  * This controls how quickly the delay approaches infinity.
  * Larger values cause it to delay more for a given amount of dirty data.
  * Therefore larger values will cause there to be less dirty data for a
  * given throughput.
  *
  * For the smoothest delay, this value should be about 1 billion divided
  * by the maximum number of operations per second.  This will smoothly
  * handle between 10x and 1/10th this number.
  *
  * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
  * multiply in dmu_tx_delay().
  */
 uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
 
 #ifdef __FreeBSD__
 
 extern int zfs_vdev_async_write_active_max_dirty_percent;
 
 SYSCTL_DECL(_vfs_zfs);
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
     &zfs_dirty_data_max, 0,
     "The maximum amount of dirty data in bytes after which new writes are "
     "halted until space becomes available");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
     &zfs_dirty_data_max_max, 0,
     "The absolute cap on dirty_data_max when auto calculating");
 
 static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
     sysctl_zfs_dirty_data_max_percent, "I",
     "The percent of physical memory used to auto calculate dirty_data_max");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
     &zfs_dirty_data_sync, 0,
     "Force a txg if the number of dirty buffer bytes exceed this value");
 
 static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
 /* No zfs_delay_min_dirty_percent tunable due to limit requirements */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
     sysctl_zfs_delay_min_dirty_percent, "I",
     "The limit of outstanding dirty data before transactions are delayed");
 
 static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
 /* No zfs_delay_scale tunable due to limit requirements */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_zfs_delay_scale, "QU",
     "Controls how quickly the delay approaches infinity");
 
 static int
 sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_dirty_data_max_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < 0 || val > 100)
 		return (EINVAL);
 
 	zfs_dirty_data_max_percent = val;
 
 	return (0);
 }
 
 static int
 sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_delay_min_dirty_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < zfs_vdev_async_write_active_max_dirty_percent)
 		return (EINVAL);
 
 	zfs_delay_min_dirty_percent = val;
 
 	return (0);
 }
 
 /*
  * Handler for the delay_scale sysctl.  A read reports the current
  * zfs_delay_scale.
  *
  * A write is rejected with EINVAL when the new value exceeds
  * UINT64_MAX / zfs_dirty_data_max, i.e. when zfs_delay_scale *
  * zfs_dirty_data_max could overflow 64 bits.
  *
  * zfs_dirty_data_max is itself a writable sysctl and may be 0.  In that
  * case the product is 0 for every scale, so any value is accepted; the
  * explicit test also keeps the bound check from dividing by zero.
  */
 static int
 sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	int err;
 
 	val = zfs_delay_scale;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (zfs_dirty_data_max != 0 && val > UINT64_MAX / zfs_dirty_data_max)
 		return (EINVAL);
 
 	zfs_delay_scale = val;
 
 	return (0);
 }
 #endif
 
 hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 
 /*
  * Look up the child of the pool's root dsl_dir called `name' and hold it.
  *
  * The object number is read from the root dir's child-dir ZAP; the
  * resulting dsl_dir is held with the pool itself (dp) as the tag and
  * returned through *ddp.  Returns 0, or the error from the lookup or the
  * hold.
  */
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t child_zapobj, ddobj;
 	int error;
 
 	child_zapobj = dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj;
 	error = zap_lookup(dp->dp_meta_objset, child_zapobj, name,
 	    sizeof (ddobj), 1, &ddobj);
 	if (error != 0)
 		return (error);
 
 	return (dsl_dir_hold_obj(dp, ddobj, name, dp, ddp));
 }
 
 /*
  * Allocate a zeroed dsl_pool_t for `spa' and initialize its in-core
  * state: a copy of the spa's root block pointer, the config rwlock, the
  * txg state, the per-txg dirty lists, dp_lock with its condition
  * variable, and the vnode-release taskq.  Nothing is read from disk here.
  */
 static dsl_pool_t *
 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 {
 	blkptr_t *rootbp = spa_get_rootblkptr(spa);
 	dsl_pool_t *dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 
 	dp->dp_meta_rootbp = *rootbp;
 	dp->dp_spa = spa;
 	rrw_init(&dp->dp_config_rwlock, B_TRUE);
 	txg_init(dp, txg);
 
 	/* One txg list per kind of object that can be dirtied. */
 	txg_list_create(&dp->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
 	txg_list_create(&dp->dp_dirty_zilogs,
 	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks,
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1,
 	    minclsyspri, 1, 4, 0);
 
 	return (dp);
 }
 
 /*
  * Build the in-core dsl_pool_t for `spa' and open its meta-objset from
  * the root block pointer.
  *
  * On success the new pool is returned through *dpp and 0 is returned.  On
  * failure the partially built pool is torn down with dsl_pool_close(),
  * *dpp is left untouched, and the error is returned.
  */
 int
 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	int error;
 
 	error = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 	    &dp->dp_meta_objset);
 	if (error == 0) {
 		*dpp = dp;
 		return (error);
 	}
 
 	dsl_pool_close(dp);
 	return (error);
 }
 
 /*
  * Populate an initialized dsl_pool_t from the on-disk pool directory.
  *
  * With the config rwlock held as writer for the whole call, this looks up
  * and holds the root dsl_dir and the MOS_DIR_NAME special dir, and,
  * depending on the pool version and active features, the origin snapshot,
  * the free dir and free bpobj, the bptree object, and the empty bpobj.
  * The temporary-userrefs object is optional.  Finally the scan state is
  * initialized.
  *
  * Returns 0 on success or the first error encountered.  Holds taken
  * before a failure are not dropped here; dsl_pool_close() releases
  * whichever of them are non-NULL.
  */
 int
 dsl_pool_open(dsl_pool_t *dp)
 {
 	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
 	if (err)
 		goto out;
 
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir);
 	if (err)
 		goto out;
 
 	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 	if (err)
 		goto out;
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		/*
 		 * The origin snapshot is the previous snapshot of the
 		 * origin dir's head dataset; the dir and head are only
 		 * held temporarily to find it.
 		 */
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 		dsl_dir_rele(dd, dp);
 		if (err)
 			goto out;
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
 			goto out;
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	/*
 	 * Note: errors ignored, because the leak dir will not exist if we
 	 * have not encountered a leak yet.
 	 */
 	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 	    &dp->dp_leak_dir);
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj);
 		if (err != 0)
 			goto out;
 	}
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj);
 		if (err != 0)
 			goto out;
 	}
 
 	/* A missing temporary-userrefs object is not an error. */
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
 	if (err == ENOENT)
 		err = 0;
 	if (err)
 		goto out;
 
 	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	return (err);
 }
 
 /*
  * Tear down a dsl_pool_t and free it.
  *
  * Every hold or object is released only if it was actually obtained, so
  * this is also usable on a pool that was only partially set up (it is
  * the error path of dsl_pool_init()).
  */
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
 	/*
 	 * Drop our references from dsl_pool_open().
 	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
 	 */
 	if (dp->dp_origin_snap)
 		dsl_dataset_rele(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir)
 		dsl_dir_rele(dp->dp_free_dir, dp);
 	if (dp->dp_leak_dir)
 		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 
 	bpobj_close(&dp->dp_free_bpobj);
 
 	/* undo the dmu_objset_open_impl(mos) from dsl_pool_init() */
 	if (dp->dp_meta_objset)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
 	/*
 	 * We can't set retry to TRUE since we're explicitly specifying
 	 * a spa to flush. This is good enough; any missed buffers for
 	 * this spa won't cause trouble, and they'll eventually fall
 	 * out of the ARC just like any other unused buffer.
 	 */
 	arc_flush(dp->dp_spa, FALSE);
 
 	txg_fini(dp);
 	dsl_scan_fini(dp);
 	dmu_buf_user_evict_wait();
 
 	/* Undo the locks and taskq set up by dsl_pool_open_impl(). */
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_vnrele_taskq);
 	if (dp->dp_blkstats)
 		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
 /*
  * Create the DSL state of a brand-new pool in transaction group `txg'.
  *
  * With the config rwlock held as writer, this creates the meta-objset,
  * the pool directory, the root and MOS_DIR_NAME dsl_dirs, and, depending
  * on the pool version, the free dir with its bpobj and the origin
  * dataset.  It then creates the root dataset and its objset; in the
  * kernel the root filesystem is populated from `zplprops'.  All of it
  * happens in a single assigned tx that is committed before returning.
  *
  * Returns the new dsl_pool_t.  Failures of the individual steps are
  * treated as fatal through VERIFY/ASSERT rather than reported.
  */
 dsl_pool_t *
 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	objset_t *os;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
 	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT0(err);
 
 	/* Initialize scan structures */
 	VERIFY0(dsl_scan_init(dp, txg));
 
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		/* create and open the free dir */
 		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 		    FREE_DIR_NAME, tx);
 		VERIFY0(dsl_pool_open_special_dir(dp,
 		    FREE_DIR_NAME, &dp->dp_free_dir));
 
 		/* create and open the free_bplist */
 		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	/* create the origin dataset and snapshot on new-enough pools */
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/* create the root dataset */
 	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
 	/* create the root objset */
 	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
 	dsl_dataset_rele(ds, FTAG);
 
 	dmu_tx_commit(tx);
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 
 	return (dp);
 }
 
 /*
  * Record a change in the space consumed by the meta-objset.
  *
  * The deltas are only accumulated here, under dp_lock, in the
  * dp_mos_*_delta fields; dsl_pool_sync() later charges them to the MOS's
  * placeholder dsl_dir and resets them.
  */
 void
 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp)
 {
 	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 
 	mutex_enter(&dp->dp_lock);
 	dp->dp_mos_uncompressed_delta += uncomp;
 	dp->dp_mos_compressed_delta += comp;
 	dp->dp_mos_used_delta += used;
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * bplist_iterate() callback: insert block pointer `bp' into the deadlist
  * passed as `arg'.  Always returns 0.
  */
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_insert((dsl_deadlist_t *)arg, bp, tx);
 
 	return (0);
 }
 
 /*
  * Write out the meta-objset under a root zio, wait for that I/O to
  * finish, and then publish the resulting MOS root block pointer to the
  * spa.
  */
 static void
 dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
 	VERIFY0(zio_wait(zio));
 
 	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 }
 
 /*
  * Adjust the pool-wide dirty data total by `delta' (which may be
  * negative) and wake one waiter on dp_spaceavail_cv if the total is now
  * within zfs_dirty_data_max.  The caller must hold dp_lock.
  */
 static void
 dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 {
 	ASSERT(MUTEX_HELD(&dp->dp_lock));
 
 	/* A decrease may never exceed what is currently accounted. */
 	if (delta < 0) {
 		ASSERT3U(-delta, <=, dp->dp_dirty_total);
 	}
 
 	dp->dp_dirty_total += delta;
 
 	/*
 	 * The signal is sent on increases too: each thread that gets
 	 * through wakes the next waiter, which guarantees forward
 	 * progress.
 	 */
 	if (!(dp->dp_dirty_total > zfs_dirty_data_max))
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
 /*
  * Sync one pass of transaction group `txg' for this pool.
  *
  * In order: write out all dirty datasets, reconcile the per-txg dirty
  * space accounting, apply user/group quota updates and sync the datasets
  * those updates re-dirtied, move pending dead blocks onto the deadlists,
  * sync dirty dsl_dirs, charge the accumulated MOS space deltas, sync the
  * MOS if it has dirty or freed dnodes, and finally run any queued sync
  * tasks.  Everything happens inside one assigned tx.
  */
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	zio_t *zio;
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	objset_t *mos = dp->dp_meta_objset;
 	list_t synced_datasets;
 
 	list_create(&synced_datasets, sizeof (dsl_dataset_t),
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Write out all dirty blocks of dirty datasets.
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * We have written all of the accounted dirty data, so the
 	 * per-txg dirty accounting (dp_dirty_pertxg[]) for this txg
 	 * should now be zero.  However, some seldom-used
 	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
 	 * rounding error in dbuf_write_physdone).
 	 * Shore up the accounting of any dirtied space now.
 	 */
 	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group space accounting.
 	 */
 	for (ds = list_head(&synced_datasets); ds != NULL;
 	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 	}
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
 	 * sync tasks, so that any snapshots will have the correct
 	 * user accounting information (and we won't get confused
 	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * Only datasets already on synced_datasets may show up
 		 * here; they stay on that list, so just drop one dbuf
 		 * hold (tagged with ds) before syncing again.
 		 */
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
 	 *  - release hold from dsl_dataset_dirty()
 	 */
 	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 		objset_t *os = ds->ds_objset;
 		bplist_iterate(&ds->ds_pending_deadlist,
 		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 		ASSERT(!dmu_objset_is_dirty(os, txg));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
 	}
 
 	/*
 	 * The MOS's space is accounted for in the pool/$MOS
 	 * (dp_mos_dir).  We can't modify the mos while we're syncing
 	 * it, so we remember the deltas and apply them here.
 	 */
 	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 	    dp->dp_mos_uncompressed_delta != 0) {
 		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 		    dp->dp_mos_used_delta,
 		    dp->dp_mos_compressed_delta,
 		    dp->dp_mos_uncompressed_delta, tx);
 		dp->dp_mos_used_delta = 0;
 		dp->dp_mos_compressed_delta = 0;
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
 	/* Only sync the MOS if it has dirty or freed dnodes in this txg. */
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 		dsl_pool_sync_mos(dp, tx);
 	}
 
 	/*
 	 * If we modify a dataset in the same txg that we want to destroy it,
 	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
 	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 	 * and clearing the hold on it) before we process the sync_tasks.
 	 * The MOS data dirtied by the sync_tasks will be synced on the next
 	 * pass.
 	 */
 	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 		/*
 		 * No more sync tasks should have been added while we
 		 * were syncing.
 		 */
 		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 			dsl_sync_task_sync(dst, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 /*
  * Post-sync cleanup for `txg': clean every ZIL that was dirtied in the
  * txg and drop the dataset dbuf hold that is tagged with that zilog.
  */
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	zilog_t *zilog;
 
 	for (;;) {
 		dsl_dataset_t *ds;
 
 		zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg);
 		if (zilog == NULL)
 			break;
 
 		ds = dmu_objset_ds(zilog->zl_os);
 		zil_clean(zilog, txg);
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
  * Returns 1 when the caller is running in syncing context, i.e. it is
  * the pool's tx_sync_thread or the spa is still being initialized;
  * returns 0 otherwise.
  */
 int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	if (curthread == dp->dp_tx.tx_sync_thread)
 		return (1);
 
 	return (spa_is_initializing(dp->dp_spa) ? 1 : 0);
 }
 
 /*
  * Return the pool's dspace minus its slop-space reservation.
  *
  * When `netfree' is set the caller is assessing an operation that frees
  * space, so only half of the reservation is withheld; this allows
  * forward progress on a full pool (e.g. rm(1) still works).
  */
 uint64_t
 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 {
 	uint64_t dspace = spa_get_dspace(dp->dp_spa);
 	uint64_t slop = spa_get_slop_space(dp->dp_spa);
 
 	if (netfree)
 		slop = slop / 2;
 
 	return (dspace - slop);
 }
 
 /*
  * Report whether the pool holds more dirty data than
  * zfs_delay_min_dirty_percent percent of zfs_dirty_data_max.
  *
  * As a side effect, a txg is kicked whenever the dirty total exceeds
  * zfs_dirty_data_sync.  Both checks are made under dp_lock.
  */
 boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	boolean_t need_delay;
 	uint64_t delay_min_bytes;
 
 	delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_total > zfs_dirty_data_sync)
 		txg_kick(dp);
 	need_delay = (dp->dp_dirty_total > delay_min_bytes);
 	mutex_exit(&dp->dp_lock);
 
 	return (need_delay);
 }
 
 /*
  * Account for `space' bytes newly dirtied in tx's txg: both the per-txg
  * counter and the pool-wide total grow, under dp_lock.  Non-positive
  * amounts are ignored.
  */
 void
 dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
 	if (space <= 0)
 		return;
 
 	mutex_enter(&dp->dp_lock);
 	dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 	dsl_pool_dirty_delta(dp, space);
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * Remove `space' bytes from the dirty accounting of `txg'.
  *
  * The amount is clamped to what is currently recorded for that txg, so
  * the per-txg counter never goes below zero; the (possibly clamped)
  * amount is then subtracted from both the per-txg counter and the
  * pool-wide total.  `space' must not be negative; 0 is a no-op.
  */
 void
 dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
 	ASSERT3S(space, >=, 0);
 	if (space == 0)
 		return;
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 		/* XXX writing something we didn't dirty? */
 		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
 	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 	ASSERT3U(dp->dp_dirty_total, >=, space);
 	/* Also updates dp_dirty_total and may wake a waiter. */
 	dsl_pool_dirty_delta(dp, -space);
 	mutex_exit(&dp->dp_lock);
 }
 
 /*
  * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones(); `arg'
  * is the syncing tx.
  *
  * Starting from head dataset `hds', walk back through ds_prev_snap_obj
  * for as long as each previous snapshot's ds_next_snap_obj points back
  * at the dataset we came from.  The walk ends with `ds' being either a
  * dataset with no previous snapshot, or one whose previous snapshot
  * (`prev') does not name it as its next snapshot.
  *
  * In the first case `ds' is attached to the pool's origin snapshot
  * (unless it is the origin itself).  In both cases `ds' is then recorded
  * in prev's ds_next_clones_obj ZAP, which is created on demand.
  *
  * Returns 0, or the error from a failed dataset hold.
  */
 /* ARGSUSED */
 static int
 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds, *prev = NULL;
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		/* Stop here with both ds and prev held. */
 		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 		prev = NULL;
 	}
 
 	if (prev == NULL) {
 		/* Not held here: dp_origin_snap is the pool's own hold. */
 		prev = dp->dp_origin_snap;
 
 		/*
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
 		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			return (0);
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
 		dsl_dataset_phys(ds)->ds_prev_snap_txg =
 		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_num_children++;
 
 		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds, &ds->ds_prev));
 		}
 	}
 
 	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
 	/* Create prev's next-clones ZAP the first time it is needed. */
 	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
 	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	/* Only release prev if we took the hold ourselves in the loop. */
 	if (prev != dp->dp_origin_snap)
 		dsl_dataset_rele(prev, FTAG);
 	return (0);
 }
 
 /*
  * Run upgrade_clones_cb() over the root dir and all of its children,
  * serialized, passing the syncing tx as the callback argument.  Must be
  * called in syncing context with the origin snapshot already set up; any
  * callback failure is fatal.
  */
 void
 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /*
  * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones();
  * `arg' is the syncing tx.
  *
  * If the dataset's dsl_dir has an origin, record the dataset's object
  * number in the dd_clones ZAP of the origin's dsl_dir, creating that ZAP
  * first if it does not exist yet.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *origin;
 
 	/* Nothing to record for a dataset that has no origin. */
 	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj == 0)
 		return (0);
 
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
 	if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 		dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 		dsl_dir_phys(origin->ds_dir)->dd_clones =
 		    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 	}
 
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
 	    dsl_dir_phys(origin->ds_dir)->dd_clones,
 	    ds->ds_object, tx));
 
 	dsl_dataset_rele(origin, FTAG);
 	return (0);
 }
 
 /*
  * Syncing-context upgrade step: create and open the free dir and the
  * pool-wide free bpobj, then run upgrade_dir_clones_cb() over the root
  * dir and all of its children.  Every failure is fatal.
  */
 void
 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    FREE_DIR_NAME, &dp->dp_free_dir));
 
 	/*
 	 * bpobj_alloc() is not usable here: spa_version() still reports
 	 * the old version, while a new-version bpobj with subobj support
 	 * is required.  Allocate the object through dmu_object_alloc()
 	 * directly instead.
 	 */
 	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /*
  * Create the ORIGIN_DIR_NAME dataset, snapshot it under the same name,
  * and hold that snapshot as dp_origin_snap with the pool as the tag.
  *
  * Must be called in syncing context with the config rwlock held as
  * writer, and only while the pool has no origin snapshot yet.
  */
 void
 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t dsobj;
 	dsl_dataset_t *ds;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap == NULL);
 	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
 	/* create the origin dir, ds, & snap-ds */
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 	/* The new snapshot is the head's previous snapshot. */
 	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Accessor for the pool's vnode-release taskq.
  */
 taskq_t *
 dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 {
 	taskq_t *tq = dp->dp_vnrele_taskq;
 
 	return (tq);
 }
 
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
  *
  * Entry names have the form "<dsobj in hex>-<tag>", as written by
  * dsl_pool_user_hold_rele_impl().  They are regrouped into an nvlist that
  * maps each dataset-object string to the set of its tags, which is then
  * handed to dsl_dataset_user_release_tmp().
  *
  * An entry whose name has no '-' separator cannot be parsed and is
  * skipped, rather than dereferencing the NULL that strchr() returns.
  *
  * Does nothing if the pool has no temporary-userrefs object.
  */
 void
 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 {
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	nvlist_t *holds;
 
 	if (zapobj == 0)
 		return;
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 
 	holds = fnvlist_alloc();
 
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		char *htag;
 		nvlist_t *tags;
 
 		/* Split "<dsobj>-<tag>" in place at the first '-'. */
 		htag = strchr(za.za_name, '-');
 		if (htag == NULL)
 			continue;
 		*htag = '\0';
 		++htag;
 		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 			/* First tag seen for this dataset object. */
 			tags = fnvlist_alloc();
 			fnvlist_add_boolean(tags, htag);
 			fnvlist_add_nvlist(holds, za.za_name, tags);
 			fnvlist_free(tags);
 		} else {
 			fnvlist_add_boolean(tags, htag);
 		}
 	}
 	dsl_dataset_user_release_tmp(dp, holds);
 	fnvlist_free(holds);
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Create the pool-wide zap object that stores temporary snapshot holds,
  * link it into the pool directory, and remember its object number in
  * dp_tmp_userrefs_obj.  Syncing context only; the object must not exist
  * yet.
  */
 void
 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dp->dp_tmp_userrefs_obj = zap_create_link(dp->dp_meta_objset,
 	    DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, tx);
 }
 
 /*
  * Shared implementation of dsl_pool_user_hold() and
  * dsl_pool_user_release().
  *
  * The entry is keyed "<dsobj in hex>-<tag>" in the temporary-userrefs
  * zap.  When `holding' is set the entry is added with `now' as its
  * value; otherwise it is removed.  Returns the zap error, or ENOENT when
  * asked to release while the zap object does not exist.
  */
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	char *name;
 	int error;
 
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/*
 	 * On a pool created before SPA_VERSION_USERREFS the zap object
 	 * for temporary holds may be missing: create it on the first
 	 * hold; a release then has nothing to remove.
 	 */
 	if (zapobj == 0) {
 		if (!holding)
 			return (SET_ERROR(ENOENT));
 		dsl_pool_user_hold_create_obj(dp, tx);
 		zapobj = dp->dp_tmp_userrefs_obj;
 	}
 
 	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 	error = holding ?
 	    zap_add(mos, zapobj, name, 8, 1, &now, tx) :
 	    zap_remove(mos, zapobj, name, tx);
 	strfree(name);
 
 	return (error);
 }
 
 /*
  * Record a temporary hold, stamped with `now', for dataset object `dsobj'
  * under `tag'.  Returns the result of the shared implementation.
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     uint64_t now, dmu_tx_t *tx)
 {
 	int error;
 
 	error = dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE);
 	return (error);
 }
 
 /*
  * Remove the temporary hold recorded for dataset object `dsobj' under
  * `tag'.  Returns the result of the shared implementation.
  */
 int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
 	int error;
 
 	error = dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE);
 	return (error);
 }
 
 /*
  * DSL Pool Configuration Lock
  *
  * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
  * creation / destruction / rename / property setting).  It must be held for
  * read to hold a dataset or dsl_dir.  I.e. you must call
  * dsl_pool_config_enter() or dsl_pool_hold() before calling
  * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
  * must be held continuously until all datasets and dsl_dirs are released.
  *
  * The only exception to this rule is that if a "long hold" is placed on
  * a dataset, then the dp_config_rwlock may be dropped while the dataset
  * is still held.  The long hold will prevent the dataset from being
  * destroyed -- the destroy will fail with EBUSY.  A long hold can be
  * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
  * (by calling dsl_{dataset,objset}_{try}own{_obj}).
  *
  * Legitimate long-holders (including owners) should be long-running, cancelable
  * tasks that should cause "zfs destroy" to fail.  This includes DMU
  * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
  * "zfs send", and "zfs diff".  There are several other long-holders whose
  * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
  *
  * The usual formula for long-holding would be:
  * dsl_pool_hold()
  * dsl_dataset_hold()
  * ... perform checks ...
  * dsl_dataset_long_hold()
  * dsl_pool_rele()
  * ... perform long-running task ...
  * dsl_dataset_long_rele()
  * dsl_dataset_rele()
  *
  * Note that when the long hold is released, the dataset is still held but
  * the pool is not held.  The dataset may change arbitrarily during this time
  * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
  * dataset except release it.
  *
  * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
  * or modifying operations.
  *
  * Modifying operations should generally use dsl_sync_task().  The synctask
  * infrastructure enforces proper locking strategy with respect to the
  * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
  *
  * Read-only operations will manually hold the pool, then the dataset, obtain
  * information from the dataset, then release the pool and dataset.
  * dmu_objset_{hold,rele}() are convenience routines that also do the pool
  * hold/rele.
  */
 
 /*
  * Open the spa that `name' refers to and enter its DSL pool's config
  * lock, both on behalf of `tag'.
  *
  * On success the pool is returned through *dp and 0 is returned; undo
  * with dsl_pool_rele().  On failure *dp is left untouched and the
  * spa_open() error is returned.
  */
 int
 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, tag);
 	if (error != 0)
 		return (error);
 
 	*dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(*dp, tag);
 	return (error);
 }
 
 /*
  * Undo dsl_pool_hold(): leave the config lock, then close the spa, both
  * with the same `tag' that was used for the hold.
  */
 void
 dsl_pool_rele(dsl_pool_t *dp, void *tag)
 {
 	dsl_pool_config_exit(dp, tag);
 	spa_close(dp->dp_spa, tag);
 }
 
 /*
  * Take the pool's config lock as reader on behalf of `tag'.  The calling
  * thread must not already hold it as reader.
  */
 void
 dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
 {
 	/*
 	 * We use a "reentrant" reader-writer lock, but not reentrantly.
 	 *
 	 * The rrwlock can (with the track_all flag) track all reading threads,
 	 * which is very useful for debugging which code path failed to release
 	 * the lock, and for verifying that the *current* thread does hold
 	 * the lock.
 	 *
 	 * (Unlike a rwlock, which knows that N threads hold it for
 	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
 	 * if any thread holds it for read, even if this thread doesn't).
 	 */
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
 }
 
 /*
  * Like dsl_pool_config_enter(), but acquires the read lock through
  * rrw_enter_read_prio().  The calling thread must not already hold the
  * config lock as reader.
  */
 void
 dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
 {
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
 }
 
 /*
  * Release the pool's config lock that was taken with `tag'.
  */
 void
 dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
 {
 	rrw_exit(&dp->dp_config_rwlock, tag);
 }
 
 /*
  * Report the result of RRW_LOCK_HELD() on the pool's config lock.
  */
 boolean_t
 dsl_pool_config_held(dsl_pool_t *dp)
 {
 	boolean_t held = RRW_LOCK_HELD(&dp->dp_config_rwlock);
 
 	return (held);
 }
 
 /*
  * Report the result of RRW_WRITE_HELD() on the pool's config lock.
  */
 boolean_t
 dsl_pool_config_held_writer(dsl_pool_t *dp)
 {
 	boolean_t held = RRW_WRITE_HELD(&dp->dp_config_rwlock);
 
 	return (held);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	(revision 296519)
@@ -1,2676 +1,2677 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 
 /*
  * Reference the vfs.zfs sysctl tree and hang a "metaslab" node off it.
  * The tunables below that name _vfs_zfs_metaslab are registered under
  * that node (i.e. as vfs.zfs.metaslab.*).
  */
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
 
 /*
  * Allow allocations to switch to gang blocks quickly. We do this to
  * avoid having to load lots of space_maps in a given txg. There are,
  * however, some cases where we want to avoid "fast" ganging and instead
  * we want to do an exhaustive search of all metaslabs on this device.
  * Currently we don't allow any gang, slog, or dump device related allocations
  * to "fast" gang.
  */
 #define	CAN_FASTGANG(flags) \
 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
 	METASLAB_GANG_AVOID)))
 
 /*
  * The two most significant bits of a 64-bit value, and a mask covering
  * both of them.
  * NOTE(review): presumably these are OR'ed into a metaslab's ms_weight to
  * mark it as activated -- confirm against the users of these macros.
  */
 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
 #define	METASLAB_ACTIVE_MASK		\
 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
 
 /*
  * Base aliquot of 512K.  metaslab_group_activate() multiplies it by the
  * number of children of the group's vdev (at least 1) to obtain mg_aliquot.
  */
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
     &metaslab_gang_bang, 0,
     "Force gang block allocation for blocks larger than or equal to this value");
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space_map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 int zfs_condense_pct = 200;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
     &zfs_condense_pct, 0,
     "Condense on-disk spacemap when it is more than this many percents"
     " of in-memory counterpart");
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  *
  * Unlike the neighbouring tunables, no sysctl is registered for this one here.
  */
 int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 int zfs_mg_noalloc_threshold = 0;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
     &zfs_mg_noalloc_threshold, 0,
     "Percentage of metaslab group size that should be free"
     " to make it eligible for allocation");
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or equal to
  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  * then it will be skipped unless all metaslab groups within the metaslab
  * class have also crossed this threshold.
  */
 int zfs_mg_fragmentation_threshold = 85;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
     &zfs_mg_fragmentation_threshold, 0,
     "Percentage of metaslab group size that should be considered "
     "eligible for allocations unless all metaslab groups within the metaslab class "
     "have also crossed this threshold");
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 int zfs_metaslab_fragmentation_threshold = 70;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
     &zfs_metaslab_fragmentation_threshold, 0,
     "Maximum percentage of metaslab fragmentation level to keep their active state");
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = 0;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
     &metaslab_debug_load, 0,
     "Load all metaslabs when pool is first opened");
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 int metaslab_debug_unload = 0;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
     &metaslab_debug_unload, 0,
     "Prevent metaslabs from being unloaded");
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using a more
  * aggressive strategy (i.e. search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
     &metaslab_df_alloc_threshold, 0,
     "Minimum size which forces the dynamic allocator to change it's allocation strategy");
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space_map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 int metaslab_df_free_pct = 4;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
     &metaslab_df_free_pct, 0,
     "The minimum free space, in percent, which must be available in a "
     "space map to continue allocations in a first-fit fashion");
 
 /*
  * A metaslab is considered "free" if it contains a contiguous
  * segment which is greater than metaslab_min_alloc_size.
  */
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
     &metaslab_min_alloc_size, 0,
     "A metaslab is considered \"free\" if it contains a contiguous "
     "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
 
 /*
  * Percentage of all cpus that can be used by the metaslab taskq.
  * Passed as the thread count (with TASKQ_THREADS_CPU_PCT) when
  * metaslab_group_create() creates each group's taskq.
  */
 int metaslab_load_pct = 50;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
     &metaslab_load_pct, 0,
     "Percentage of cpus that can be used by the metaslab taskq");
 
 /*
  * Determines how many txgs a metaslab may remain loaded without having any
  * allocations from it. As long as a metaslab continues to be used we will
  * keep it loaded.
  */
 int metaslab_unload_delay = TXG_SIZE * 2;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
     &metaslab_unload_delay, 0,
     "Number of TXGs that an unused metaslab can be kept in memory");
 
 /*
  * Max number of metaslabs per group to preload.
  */
 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
     &metaslab_preload_limit, 0,
     "Max number of metaslabs per group to preload");
 
 /*
  * Enable/disable preloading of metaslabs.  This is an on/off switch;
  * the number of metaslabs preloaded per group is governed separately by
  * metaslab_preload_limit.
  */
 boolean_t metaslab_preload_enabled = B_TRUE;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
     &metaslab_preload_enabled, 0,
     "Enable preloading of metaslabs");
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
     &metaslab_fragmentation_factor_enabled, 0,
     "Enable fragmentation weighting on metaslabs");
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
     &metaslab_lba_weighting_enabled, 0,
     "Enable LBA weighting (i.e. outer tracks are given preference)");
 
 /*
  * Enable/disable metaslab group biasing.
  */
 boolean_t metaslab_bias_enabled = B_TRUE;
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
     &metaslab_bias_enabled, 0,
     "Enable metaslab group biasing");
 
 /* Forward declaration of a file-local helper. */
 static uint64_t metaslab_fragmentation(metaslab_t *);
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 /*
  * Allocate a zero-filled metaslab class (sleeping allocation) and record
  * the owning spa and the allocator ops in it.  The new class has no
  * rotor, i.e. no metaslab groups are linked to it yet.
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
 	metaslab_class_t *newmc = kmem_zalloc(sizeof (*newmc), KM_SLEEP);
 
 	newmc->mc_ops = ops;
 	newmc->mc_rotor = NULL;
 	newmc->mc_spa = spa;
 
 	return (newmc);
 }
 
 /*
  * Free a metaslab class.  By the time this is called the class is
  * expected to have no groups on its rotor and all of its space
  * accounting back at zero (asserted on debug builds).
  */
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	/* No groups may remain linked. */
 	ASSERT(mc->mc_rotor == NULL);
 
 	/* All space counters must have been unwound. */
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	kmem_free(mc, sizeof (*mc));
 }
 
 /*
  * Walk the class's circular list of metaslab groups and sanity-check
  * each one with assertions: the group's vdev must have a metaslab group,
  * must be a top-level vdev, must not be a hole, and the group must point
  * back at this class.  Always returns 0.
  */
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
 	metaslab_group_t *mg = mc->mc_rotor;
 	vdev_t *vd;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	/* An empty rotor means there is nothing to check. */
 	if (mg == NULL)
 		return (0);
 
 	for (;;) {
 		vd = mg->mg_vd;
 		ASSERT(vd->vdev_mg != NULL);
 		ASSERT3P(vd->vdev_top, ==, vd);
 		ASSERT3P(mg->mg_class, ==, mc);
 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 
 		/* Stop once the walk is back at the rotor. */
 		mg = mg->mg_next;
 		if (mg == mc->mc_rotor)
 			break;
 	}
 
 	return (0);
 }
 
 /*
  * Apply signed deltas to the class's four space counters (mc_alloc,
  * mc_deferred, mc_space and mc_dspace).  Each counter is adjusted with
  * its own atomic add; the four updates are not atomic as a group.
  */
 void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 /*
  * Recompute mc_minblocksize as 1 << (smallest vdev_ashift among the
  * vdevs of the groups on the rotor).  With an empty rotor the value
  * falls back to SPA_MINBLOCKSIZE.
  */
 void
 metaslab_class_minblocksize_update(metaslab_class_t *mc)
 {
 	metaslab_group_t *cur = mc->mc_rotor;
 	uint64_t smallest = UINT64_MAX;
 
 	if (cur == NULL) {
 		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
 		return;
 	}
 
 	/* One full lap around the circular group list. */
 	for (;;) {
 		vdev_t *vd = cur->mg_vd;
 
 		if (vd->vdev_ashift < smallest)
 			smallest = vd->vdev_ashift;
 
 		cur = cur->mg_next;
 		if (cur == mc->mc_rotor)
 			break;
 	}
 
 	mc->mc_minblocksize = 1ULL << smallest;
 }
 
 /*
  * Return the class's mc_alloc counter.
  */
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	uint64_t allocated = mc->mc_alloc;
 
 	return (allocated);
 }
 
 /*
  * Return the class's mc_deferred counter.
  */
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	uint64_t deferred = mc->mc_deferred;
 
 	return (deferred);
 }
 
 /*
  * Return the class's mc_space counter.
  */
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	uint64_t total = mc->mc_space;
 
 	return (total);
 }
 
 /*
  * Return mc_dspace when spa_deflate() is true for the owning pool,
  * and the raw mc_space otherwise.
  */
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	if (spa_deflate(mc->mc_spa))
 		return (mc->mc_dspace);
 
 	return (mc->mc_space);
 }
 
 /*
  * Return the minimum block size last computed by
  * metaslab_class_minblocksize_update().
  */
 uint64_t
 metaslab_class_get_minblocksize(metaslab_class_t *mc)
 {
 	uint64_t minblock = mc->mc_minblocksize;
 
 	return (minblock);
 }
 
 /*
  * Debug check, active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
  * zfs_flags: rebuild the class histogram by summing the histograms of
  * every metaslab group that belongs to this class and VERIFY that each
  * bucket matches mc_histogram.
  */
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	vdev_t *root = mc->mc_spa->spa_root_vdev;
 	uint64_t *expected;
 	int bucket;
 
 	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
 		return;
 
 	expected = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	for (int child = 0; child < root->vdev_children; child++) {
 		vdev_t *top = root->vdev_child[child];
 		metaslab_group_t *grp = top->vdev_mg;
 
 		/*
 		 * Holes, top-levels that are not initialized yet, and
 		 * vdevs belonging to another metaslab class contribute
 		 * nothing.
 		 */
 		if (top->vdev_ishole || top->vdev_ms_shift == 0 ||
 		    grp->mg_class != mc) {
 			continue;
 		}
 
 		for (bucket = 0; bucket < RANGE_TREE_HISTOGRAM_SIZE; bucket++)
 			expected[bucket] += grp->mg_histogram[bucket];
 	}
 
 	for (bucket = 0; bucket < RANGE_TREE_HISTOGRAM_SIZE; bucket++)
 		VERIFY3U(expected[bucket], ==, mc->mc_histogram[bucket]);
 
 	kmem_free(expected, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set: either one of the
  * class's metaslab groups has no fragmentation metric, or the class
  * currently has no space to weight the groups against. See comment above
  * the zfs_frag_table for more information about the metric.
  *
  * SCL_VDEV is held as reader while the top-level vdevs are walked.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 	uint64_t space;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels, or
 		 * vdevs that are not in this metaslab class.
 		 */
 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 			return (ZFS_FRAG_INVALID);
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 
 	/*
 	 * A class that currently accounts for no space has nothing to
 	 * weight against; report the metric as unset instead of dividing
 	 * by zero.
 	 */
 	space = metaslab_class_get_space(mc);
 	if (space == 0) {
 		spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 		return (ZFS_FRAG_INVALID);
 	}
 	fragmentation /= space;
 
 	ASSERT3U(fragmentation, <=, 100);
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Sum, over every top-level vdev that belongs to this metaslab class,
  * the difference between the vdev's maximum allocatable size
  * (vdev_max_asize) and its current allocatable size (vdev_asize).  That
  * is the space the class would gain if those devices were expanded.
  * SCL_VDEV is held as reader for the walk.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	vdev_t *root = mc->mc_spa->spa_root_vdev;
 	uint64_t expandable = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int child = 0; child < root->vdev_children; child++) {
 		vdev_t *top = root->vdev_child[child];
 		metaslab_group_t *grp = top->vdev_mg;
 
 		/* Same filter as the other per-class walks in this file. */
 		if (top->vdev_ishole || top->vdev_ms_shift == 0 ||
 		    grp->mg_class != mc) {
 			continue;
 		}
 
 		expandable += top->vdev_max_asize - top->vdev_asize;
 	}
 
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 
 	return (expandable);
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * AVL comparator for a group's metaslab tree.  Heavier metaslabs sort
  * first (descending ms_weight); ties are broken by ascending ms_start so
  * that no two distinct metaslabs ever compare equal.
  */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *a = x1;
 	const metaslab_t *b = x2;
 
 	if (a->ms_weight != b->ms_weight)
 		return (a->ms_weight < b->ms_weight ? 1 : -1);
 
 	/*
 	 * Equal weights: fall back to the starting offset to force
 	 * uniqueness.
 	 */
 	if (a->ms_start != b->ms_start)
 		return (a->ms_start < b->ms_start ? -1 : 1);
 
 	ASSERT3P(a, ==, b);
 
 	return (0);
 }
 
 /*
  * Refresh the metaslab group's free-capacity percentage and its
  * allocatable flag.  The group is allocatable when its free capacity is
  * above zfs_mg_noalloc_threshold and its fragmentation is either unknown
  * (ZFS_FRAG_INVALID) or no greater than zfs_mg_fragmentation_threshold.
  * If the flag flips in either direction, the class's count of
  * allocatable groups (mc_alloc_groups) is adjusted to match.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *stats = &vd->vdev_stat;
 	boolean_t before;
 
 	ASSERT(vd == vd->vdev_top);
 
 	mutex_enter(&mg->mg_lock);
 	before = mg->mg_allocatable;
 
 	/* Free space as a percentage; the +1 keeps the divisor non-zero. */
 	mg->mg_free_capacity = ((stats->vs_space - stats->vs_alloc) * 100) /
 	    (stats->vs_space + 1);
 
 	/*
 	 * Plenty of free space AND not heavily fragmented.  Fragmentation
 	 * only counts when the group has a valid metric (i.e. a value
 	 * between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * mc_alloc_groups counts the groups of this class that are still
 	 * allocatable.  Allocating threads use it to decide whether a
 	 * group at or below the threshold should be avoided: they avoid
 	 * it only while other groups remain above the threshold.  Once
 	 * the count reaches 0 every group has hit the threshold, which
 	 * makes all of them eligible again -- effectively the devices
 	 * are balanced again.  Keep the count in step with any
 	 * transition of this group's flag.
 	 */
 	if (!before && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	else if (before && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Allocate and initialize a metaslab group for vdev "vd" in class "mc":
  * its lock, its AVL tree of metaslabs (ordered by metaslab_compare) and
  * its taskq.  The group starts with an activation count of zero and is
  * not linked to the class rotor.
  */
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
 	metaslab_group_t *grp = kmem_zalloc(sizeof (*grp), KM_SLEEP);
 
 	mutex_init(&grp->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&grp->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 
 	grp->mg_vd = vd;
 	grp->mg_class = mc;
 	grp->mg_activation_count = 0;
 
 	/* Thread count is given as a percentage of CPUs. */
 	grp->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 
 	return (grp);
 }
 
 /*
  * Tear down a metaslab group created by metaslab_group_create().  The
  * group must already be unlinked from its class's rotor.
  */
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 
 	/*
 	 * The activation count may be negative here: either the group
 	 * was never activated at all, or we are finished with it and
 	 * may be removing the vdev.
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* Release resources in the reverse order of their creation. */
 	taskq_destroy(mg->mg_taskq);
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 
 	kmem_free(mg, sizeof (*mg));
 }
 
 /*
  * Bump the group's activation count and, if the count becomes positive,
  * link the group into its class's circular rotor list (right after the
  * current rotor) and make it the new rotor.  The caller must hold
  * SCL_ALLOC as writer.
  */
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	metaslab_group_t *rotor, *after;
 
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 
 	ASSERT(mc->mc_rotor != mg);
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* Still not positive: the group stays inactive. */
 	mg->mg_activation_count++;
 	if (mg->mg_activation_count <= 0)
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 	metaslab_group_alloc_update(mg);
 
 	rotor = mc->mc_rotor;
 	if (rotor != NULL) {
 		/* Splice in between the rotor and its successor. */
 		after = rotor->mg_next;
 		mg->mg_prev = rotor;
 		mg->mg_next = after;
 		rotor->mg_next = mg;
 		after->mg_prev = mg;
 	} else {
 		/* First group in the class: a ring of one. */
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	}
 	mc->mc_rotor = mg;
 	metaslab_class_minblocksize_update(mc);
 }
 
 /*
  * Drop the group's activation count.  If the count reaches exactly zero
  * the group was active: wait for its taskq to drain, refresh its
  * allocatable state, and unlink it from the class's rotor list.  The
  * caller must hold SCL_ALLOC as writer.
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	metaslab_group_t *before, *after;
 
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 
 	mg->mg_activation_count--;
 	if (mg->mg_activation_count != 0) {
 		/* The group was not active, so it is not on the rotor. */
 		ASSERT(mc->mc_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	taskq_wait(mg->mg_taskq);
 	metaslab_group_alloc_update(mg);
 
 	before = mg->mg_prev;
 	after = mg->mg_next;
 
 	if (after != mg) {
 		/* Close the ring around the departing group. */
 		mc->mc_rotor = after;
 		before->mg_next = after;
 		after->mg_prev = before;
 	} else {
 		/* It was the only group on the ring. */
 		mc->mc_rotor = NULL;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 	metaslab_class_minblocksize_update(mc);
 }
 
 /*
  * Return the space spanned by the group's vdev's metaslabs:
  * (1 << vdev_ms_shift) times the number of metaslabs.
  */
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 
 	return ((1ULL << vd->vdev_ms_shift) * vd->vdev_ms_count);
 }
 
 /*
  * Debug check, active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
  * zfs_flags: rebuild the group histogram from the on-disk space map
  * histograms of the vdev's metaslabs (shifted up by the vdev's ashift)
  * and VERIFY that every bucket matches mg_histogram.
  */
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	uint64_t *expected;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t shift = vd->vdev_ashift;
 	int bucket;
 
 	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
 		return;
 
 	expected = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	/* The shifted space map buckets must fit in the larger table. */
 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + shift);
 
 	for (int idx = 0; idx < vd->vdev_ms_count; idx++) {
 		metaslab_t *ms = vd->vdev_ms[idx];
 
 		/* A metaslab without a space map contributes nothing. */
 		if (ms->ms_sm == NULL)
 			continue;
 
 		for (bucket = 0; bucket < SPACE_MAP_HISTOGRAM_SIZE; bucket++)
 			expected[bucket + shift] +=
 			    ms->ms_sm->sm_phys->smp_histogram[bucket];
 	}
 
 	for (bucket = 0; bucket < RANGE_TREE_HISTOGRAM_SIZE; bucket++)
 		VERIFY3U(expected[bucket], ==, mg->mg_histogram[bucket]);
 
 	kmem_free(expected, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Add the metaslab's space map histogram to both its group's and its
  * class's histogram, offsetting each bucket by the vdev's ashift.  The
  * caller must hold ms_lock; mg_lock is taken here.  A metaslab with no
  * space map is ignored.
  */
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t shift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[b];
 
 		mg->mg_histogram[b + shift] += count;
 		mc->mc_histogram[b + shift] += count;
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Inverse of metaslab_group_histogram_add(): subtract the metaslab's
  * space map histogram from its group's and its class's histogram,
  * offsetting each bucket by the vdev's ashift.  The caller must hold
  * ms_lock; mg_lock is taken here.  A metaslab with no space map is
  * ignored.
  */
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t shift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[b];
 
 		/* Neither total may go below zero. */
 		ASSERT3U(mg->mg_histogram[b + shift], >=, count);
 		ASSERT3U(mc->mc_histogram[b + shift], >=, count);
 
 		mg->mg_histogram[b + shift] -= count;
 		mc->mc_histogram[b + shift] -= count;
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Attach a metaslab that currently has no group to "mg": record the
  * group in the metaslab, reset its weight to 0, insert it into the
  * group's AVL tree, and then fold its histogram into the group and
  * class histograms.
  */
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 	/* Tree membership and ms_group/ms_weight change under mg_lock. */
 	mutex_enter(&mg->mg_lock);
 	msp->ms_group = mg;
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	/* metaslab_group_histogram_add() asserts that ms_lock is held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Detach a metaslab from "mg": first subtract its histogram from the
  * group and class histograms, then remove it from the group's AVL tree
  * and clear its ms_group pointer.
  */
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	/* metaslab_group_histogram_remove() asserts that ms_lock is held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	/* Tree membership and ms_group change under mg_lock. */
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Give the metaslab a new weight and reposition it in its group's AVL
  * tree.  The caller must hold ms_lock; mg_lock is taken here.
  */
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * Although in principle the weight can be any value, in
 	 * practice we do not use values in the range [1, 511].
 	 */
 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	/*
 	 * The weight is part of the tree's sort key (see metaslab_compare),
 	 * so the node is removed before the key changes and re-added after.
 	 */
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_weight = weight;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Compute a metaslab group's fragmentation as the plain average of the
  * fragmentation of those of its metaslabs that have a valid metric.  A
  * simple average suffices because all metaslabs within the group must
  * have the same size.  Returns a value between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID unless more than half of the group's metaslabs have
  * a fragmentation metric.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t total = 0;
 	uint64_t counted = 0;
 
 	for (int idx = 0; idx < vd->vdev_ms_count; idx++) {
 		metaslab_t *ms = vd->vdev_ms[idx];
 
 		if (ms->ms_fragmentation != ZFS_FRAG_INVALID) {
 			counted++;
 			total += ms->ms_fragmentation;
 		}
 	}
 
 	/* Too few samples to be meaningful. */
 	if (counted <= vd->vdev_ms_count / 2)
 		return (ZFS_FRAG_INVALID);
 
 	total /= counted;
 	ASSERT3U(total, <=, 100);
 	return (total);
 }
 
 /*
  * Decide whether allocations may be directed at this metaslab group.
  * A group is skipped only when it is low on free space (free capacity
  * not above zfs_mg_noalloc_threshold) or too fragmented (a valid metric
  * above zfs_mg_fragmentation_threshold), AND it belongs to the pool's
  * normal class, AND that class still has at least one group that can
  * take allocations.
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
 
 	/*
 	 * The two key metrics are free space and fragmentation: enough
 	 * free space together with acceptable (or unknown) fragmentation
 	 * makes the group allocatable outright.
 	 */
 	if (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold))
 		return (B_TRUE);
 
 	/*
 	 * The metrics are ignored for groups outside the normal class
 	 * (e.g. a group associated with a slog device).
 	 */
 	if (mc != spa_normal_class(spa))
 		return (B_TRUE);
 
 	/*
 	 * They are also ignored once every group in the class has been
 	 * considered ineligible for allocations.
 	 */
 	if (mc->mc_alloc_groups == 0)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * AVL comparator for the per-metaslab size-ordered tree.  Segments are
  * ordered by length, shortest first (so the largest segment is last);
  * segments of equal length are ordered by ascending start offset.
  */
 static int
 metaslab_rangesize_compare(const void *x1, const void *x2)
 {
 	const range_seg_t *a = x1;
 	const range_seg_t *b = x2;
 	uint64_t len_a = a->rs_end - a->rs_start;
 	uint64_t len_b = b->rs_end - b->rs_start;
 
 	if (len_a != len_b)
 		return (len_a < len_b ? -1 : 1);
 
 	if (a->rs_start != b->rs_start)
 		return (a->rs_start < b->rs_start ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * Range tree "create" callback: set up the metaslab's size-ordered AVL
  * tree (ms_size_tree), which indexes range_seg_t nodes through their
  * rs_pp_node linkage.  "arg" is the metaslab the range tree belongs to.
  */
 static void
 metaslab_rt_create(range_tree_t *rt, void *arg)
 {
 	metaslab_t *ms = arg;
 
 	ASSERT3P(rt->rt_arg, ==, ms);
 	/* The metaslab must not have a range tree attached yet. */
 	ASSERT(ms->ms_tree == NULL);
 
 	avl_create(&ms->ms_size_tree, metaslab_rangesize_compare,
 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
 /*
  * Range tree "destroy" callback: tear down the metaslab's size-ordered
  * AVL tree, which must already be empty.
  */
 static void
 metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
 	metaslab_t *ms = arg;
 
 	ASSERT3P(rt->rt_arg, ==, ms);
 	ASSERT3P(ms->ms_tree, ==, rt);
 	ASSERT0(avl_numnodes(&ms->ms_size_tree));
 
 	avl_destroy(&ms->ms_size_tree);
 }
 
 /*
  * Range tree "add" callback: mirror a segment added to the range tree
  * into the metaslab's size-ordered AVL tree.  Must not happen while the
  * metaslab is condensing.
  */
 static void
 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_t *ms = arg;
 
 	ASSERT3P(rt->rt_arg, ==, ms);
 	ASSERT3P(ms->ms_tree, ==, rt);
 	VERIFY(!ms->ms_condensing);
 
 	avl_add(&ms->ms_size_tree, rs);
 }
 
 /*
  * Range tree "remove" callback: drop a segment from the metaslab's
  * size-ordered AVL tree.  Must not happen while the metaslab is
  * condensing.
  */
 static void
 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_t *ms = arg;
 
 	ASSERT3P(rt->rt_arg, ==, ms);
 	ASSERT3P(ms->ms_tree, ==, rt);
 	VERIFY(!ms->ms_condensing);
 
 	avl_remove(&ms->ms_size_tree, rs);
 }
 
 /*
  * Range tree "vacate" callback: empty the metaslab's size-ordered AVL
  * tree in one step.
  */
 static void
 metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
 	metaslab_t *ms = arg;
 
 	ASSERT3P(rt->rt_arg, ==, ms);
 	ASSERT3P(ms->ms_tree, ==, rt);
 
 	/*
 	 * The usual way to empty a tree is to walk it and free each node.
 	 * Here the nodes are shared with the range tree, which frees them
 	 * itself, so they must not be freed from this side.  Simply
 	 * reinitializing the AVL tree is enough and avoids the walk.
 	 */
 	avl_create(&ms->ms_size_tree, metaslab_rangesize_compare,
 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
 /*
  * Callback table that keeps a metaslab's size-ordered AVL tree in sync
  * with its range tree.  The initializer is positional.
  * NOTE(review): the order is assumed to match the create / destroy /
  * add / remove / vacate members of range_tree_ops_t -- confirm against
  * the structure definition.
  */
 static range_tree_ops_t metaslab_rt_ops = {
 	metaslab_rt_create,
 	metaslab_rt_destroy,
 	metaslab_rt_add,
 	metaslab_rt_remove,
 	metaslab_rt_vacate
 };
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 /*
  * Return the length of the largest contiguous free segment tracked in
  * the metaslab's size-ordered tree, or 0 if that tree is empty.  The
  * tree sorts by ascending size, so the largest segment is its last node.
  */
 uint64_t
 metaslab_block_maxsize(metaslab_t *msp)
 {
 	range_seg_t *largest = avl_last(&msp->ms_size_tree);
 
 	if (largest == NULL)
 		return (0ULL);
 
 	return (largest->rs_end - largest->rs_start);
 }
 
 /*
  * Ask the metaslab's allocator (msop_alloc) for "size" bytes.  On
  * success the chosen range is removed from the metaslab's free range
  * tree and its starting offset is returned.  On failure -1ULL is
  * returned and the tree is left untouched.  Must not be called while
  * the metaslab is condensing.
  */
 uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *tree = msp->ms_tree;
 	uint64_t offset;
 	vdev_t *vd;
 
 	VERIFY(!msp->ms_condensing);
 
 	offset = msp->ms_ops->msop_alloc(msp, size);
 	if (offset == -1ULL)
 		return (offset);
 
 	vd = msp->ms_group->mg_vd;
 
 	/* Offset and size must both be multiples of 1 << ashift. */
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(range_tree_space(tree) - size, <=, msp->ms_size);
 	range_tree_remove(tree, offset, size);
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Helper shared by the allocators: starting at *cursor, scan the AVL
  * tree "t" for the first segment that can hold "size" bytes at an offset
  * rounded up to "align".  On success *cursor is advanced just past the
  * allocation and the offset is returned.  If nothing fits from a
  * non-zero cursor onward, the cursor is reset to 0 and the scan is
  * repeated once from the beginning; if that also fails, -1ULL is
  * returned.
  */
 static uint64_t
 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
     uint64_t align)
 {
 	for (;;) {
 		range_seg_t key;
 		range_seg_t *seg;
 		avl_index_t where;
 
 		key.rs_start = *cursor;
 		key.rs_end = *cursor + size;
 
 		/* Exact match, or else the nearest segment after the key. */
 		seg = avl_find(t, &key, &where);
 		if (seg == NULL)
 			seg = avl_nearest(t, where, AVL_AFTER);
 
 		for (; seg != NULL; seg = AVL_NEXT(t, seg)) {
 			uint64_t offset = P2ROUNDUP(seg->rs_start, align);
 
 			if (offset + size <= seg->rs_end) {
 				*cursor = offset + size;
 				return (offset);
 			}
 		}
 
 		/*
 		 * A scan that started at 0 has covered the whole map, so
 		 * give up.  Otherwise wrap the cursor around and retry
 		 * from the beginning.
 		 */
 		if (*cursor == 0)
 			return (-1ULL);
 
 		*cursor = 0;
 	}
 }
 
 /*
  * ==========================================================================
  * The first-fit block allocator
  * ==========================================================================
  */
 static uint64_t
 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
 	/*
 	 * Isolating the lowest set bit of the size gives the largest power
 	 * of 2 that divides it evenly. That value selects the cursor bucket
 	 * and serves as the alignment, so that similarly aligned requests
 	 * tend to be allocated from the same area of the metaslab. Nothing
 	 * here prevents other allocation sizes from landing in that region.
 	 */
 	uint64_t pow2 = size & -size;
 
 	return (metaslab_block_picker(&msp->ms_tree->rt_root,
 	    &msp->ms_lbas[highbit64(pow2) - 1], size, pow2));
 }
 
 /* Allocator ops table whose allocation entry is metaslab_ff_alloc(). */
 static metaslab_ops_t metaslab_ff_ops = {
 	metaslab_ff_alloc
 };
 
 /*
  * ==========================================================================
  * Dynamic block allocator -
  * Uses the first fit allocation scheme until space get low and then
  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
  * ==========================================================================
  */
 static uint64_t
 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
 	/*
 	 * The lowest set bit of the size is the largest power of 2 that
 	 * divides it evenly; it picks the cursor bucket so that similarly
 	 * aligned requests tend to share a region of the metaslab. Other
 	 * allocation sizes may still end up in the same region.
 	 */
 	uint64_t pow2 = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(pow2) - 1];
 	range_tree_t *free_tree = msp->ms_tree;
 	avl_tree_t *search_tree = &free_tree->rt_root;
 	uint64_t largest = metaslab_block_maxsize(msp);
 	int free_pct = range_tree_space(free_tree) * 100 / msp->ms_size;
 	int best_fit;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(search_tree), ==,
 	    avl_numnodes(&msp->ms_size_tree));
 
 	/* Not even the largest free segment can hold this request. */
 	if (size > largest)
 		return (-1ULL);
 
 	/*
 	 * When the largest segment or the free percentage drops below its
 	 * tunable, search the size-sorted tree from the start (best-fit)
 	 * instead of continuing from the cursor (first-fit).
 	 */
 	best_fit = (largest < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct);
 	if (best_fit) {
 		search_tree = &msp->ms_size_tree;
 		*cursor = 0;
 	}
 
 	return (metaslab_block_picker(search_tree, cursor, size, 1ULL));
 }
 
 /* Allocator ops table whose allocation entry is metaslab_df_alloc(). */
 static metaslab_ops_t metaslab_df_ops = {
 	metaslab_df_alloc
 };
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *free_tree = msp->ms_tree;
 	avl_tree_t *size_tree = &msp->ms_size_tree;
 	uint64_t *cur = &msp->ms_lbas[0];
 	uint64_t *cur_end = &msp->ms_lbas[1];
 	uint64_t start;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(size_tree), ==,
 	    avl_numnodes(&free_tree->rt_root));
 
 	ASSERT3U(*cur_end, >=, *cur);
 
 	/*
 	 * The current range cannot hold the request: switch to the last
 	 * (largest) segment of the size tree, or fail if even that one
 	 * is too small.
 	 */
 	if (*cur + size > *cur_end) {
 		range_seg_t *largest = avl_last(size_tree);
 
 		if (largest == NULL)
 			return (-1ULL);
 		if ((largest->rs_end - largest->rs_start) < size)
 			return (-1ULL);
 
 		*cur = largest->rs_start;
 		*cur_end = largest->rs_end;
 	}
 
 	/* Hand out the space at the cursor and advance past it. */
 	start = *cur;
 	*cur = start + size;
 
 	return (start);
 }
 
 /* Allocator ops table whose allocation entry is metaslab_cf_alloc(). */
 static metaslab_ops_t metaslab_cf_ops = {
 	metaslab_cf_alloc
 };
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
  * to request from the allocator. metaslab_ndf_alloc() adds this shift to the
  * highest set bit of the requested size when it builds its fallback search
  * key for the size-sorted tree.
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
 	avl_tree_t *tree = &msp->ms_tree->rt_root;
 	avl_index_t where;
 	range_seg_t *seg, key;
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t largest = metaslab_block_maxsize(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(tree), ==, avl_numnodes(&msp->ms_size_tree));
 
 	/* Not even the largest free segment can hold this request. */
 	if (size > largest)
 		return (-1ULL);
 
 	/* First try the segment located at this bucket's cursor. */
 	key.rs_start = *cursor;
 	key.rs_end = *cursor + size;
 	seg = avl_find(tree, &key, &where);
 
 	if (seg == NULL || (seg->rs_end - seg->rs_start) < size) {
 		uint64_t clump = 1ULL << (hbit + metaslab_ndf_clump_shift);
 
 		/*
 		 * Nothing usable at the cursor. Search the size-sorted tree
 		 * for a segment of the clump size, capped at the largest
 		 * segment available.
 		 */
 		tree = &msp->ms_size_tree;
 
 		key.rs_start = 0;
 		key.rs_end = MIN(largest, clump);
 		seg = avl_find(tree, &key, &where);
 		if (seg == NULL)
 			seg = avl_nearest(tree, where, AVL_AFTER);
 		ASSERT(seg != NULL);
 	}
 
 	if ((seg->rs_end - seg->rs_start) < size)
 		return (-1ULL);
 
 	*cursor = seg->rs_start + size;
 	return (seg->rs_start);
 }
 
 /* Allocator ops table whose allocation entry is metaslab_ndf_alloc(). */
 static metaslab_ops_t metaslab_ndf_ops = {
 	metaslab_ndf_alloc
 };
 
 /* Exported allocator ops pointer; initialized to the dynamic block allocator. */
 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
 void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Sleep on the load condvar until ms_loading is cleared.
 	 * metaslab_load() broadcasts on it when it finishes.
 	 */
 	for (;;) {
 		if (!msp->ms_loading)
 			break;
 
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	}
 }
 
 int
 metaslab_load(metaslab_t *msp)
 {
 	int error = 0;
 	int d;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_loaded);
 	ASSERT(!msp->ms_loading);
 
 	msp->ms_loading = B_TRUE;
 
 	if (msp->ms_sm == NULL) {
 		/*
 		 * No space map has been allocated yet, so the whole
 		 * metaslab is free: add its full extent to ms_tree.
 		 */
 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
 	} else {
 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
 	}
 
 	msp->ms_loaded = (error == 0);
 	msp->ms_loading = B_FALSE;
 
 	/*
 	 * On success, take every range held in the defer trees back out
 	 * of the freshly loaded ms_tree.
 	 */
 	if (msp->ms_loaded) {
 		for (d = 0; d < TXG_DEFER_SIZE; d++) {
 			range_tree_walk(msp->ms_defertree[d],
 			    range_tree_remove, msp->ms_tree);
 		}
 	}
 
 	/* Wake up anyone blocked in metaslab_load_wait(). */
 	cv_broadcast(&msp->ms_load_cv);
 	return (error);
 }
 
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Empty ms_tree, then mark the metaslab unloaded and inactive. */
 	range_tree_vacate(msp->ms_tree, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 	msp->ms_weight = msp->ms_weight & ~METASLAB_ACTIVE_MASK;
 }
 
 /*
  * Allocate and initialize metaslab number 'id' of the group 'mg'.
  *
  * 'object' is the space map object to open, or 0 if none exists yet.
  * 'txg' controls when the space becomes available (see the comment before
  * the metaslab_sync_done() call below). On success the new metaslab is
  * stored in *msp and 0 is returned. If space_map_open() fails, its error
  * is returned, nothing is stored in *msp, and everything set up here is
  * torn down again.
  */
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
     metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it.
 	 */
 	if (object != 0) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
 
 		if (error != 0) {
 			/*
 			 * Tear down the condvar and mutex initialized above
 			 * before releasing the memory that contains them,
 			 * mirroring the teardown order in metaslab_fini().
 			 */
 			cv_destroy(&ms->ms_load_cv);
 			mutex_destroy(&ms->ms_lock);
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 	}
 
 	/*
 	 * We create the main range tree here, but we don't create the
 	 * alloctree and freetree until metaslab_sync_done().  This serves
 	 * two purposes: it allows metaslab_sync_done() to detect the
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
 	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
 	metaslab_group_add(mg, ms);
 
 	ms->ms_fragmentation = metaslab_fragmentation(ms);
 	ms->ms_ops = mg->mg_class->mc_ops;
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 */
 	if (txg <= TXG_INITIAL)
 		metaslab_sync_done(ms, 0);
 
 	/*
 	 * If metaslab_debug_load is set and we're initializing a metaslab
 	 * that has an allocated space_map object, then load its space
 	 * map so that we can verify frees.
 	 */
 	if (metaslab_debug_load && ms->ms_sm != NULL) {
 		mutex_enter(&ms->ms_lock);
 		VERIFY0(metaslab_load(ms));
 		mutex_exit(&ms->ms_lock);
 	}
 
 	/* For a nonzero txg, mark the vdev and this metaslab dirty. */
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 void
 metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	int t;
 
 	/* Detach from the group first; this clears msp->ms_group. */
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 
 	VERIFY(msp->ms_group == NULL);
 
 	/* Subtract this metaslab's allocated space and size from the vdev. */
 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
 	    0, -msp->ms_size);
 	space_map_close(msp->ms_sm);
 
 	metaslab_unload(msp);
 	range_tree_destroy(msp->ms_tree);
 
 	/* Per-txg allocation and free trees. */
 	for (t = 0; t < TXG_SIZE; t++) {
 		range_tree_destroy(msp->ms_alloctree[t]);
 		range_tree_destroy(msp->ms_freetree[t]);
 	}
 
 	/* Deferred-free trees. */
 	for (t = 0; t < TXG_DEFER_SIZE; t++)
 		range_tree_destroy(msp->ms_defertree[t]);
 
 	ASSERT0(msp->ms_deferspace);
 
 	mutex_exit(&msp->ms_lock);
 
 	/* Destroy the synchronization objects, then the metaslab itself. */
 	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 #define	FRAGMENTATION_TABLE_SIZE	17
 
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
  * multiplying that by the fragmentation metric in this table. Doing
  * this for all buckets and dividing it by the total amount of free
  * space in this metaslab (i.e. the total free space in all buckets) gives
  * us the fragmentation metric. This means that a high fragmentation metric
  * equates to most of the free space being comprised of small segments.
  * Conversely, if the metric is low, then most of the free space is in
  * large segments. A 10% change in fragmentation equates to approximately
  * double the number of segments.
  *
  * This table defines 0% fragmented space using 16MB segments. Testing has
  * shown that segments that are greater than or equal to 16MB do not suffer
  * from drastic performance problems. Using this value, we derive the rest
  * of the table. Since the fragmentation value is never stored on disk, it
  * is possible to change these calculations in the future.
  *
  * Note that the array is declared with FRAGMENTATION_TABLE_SIZE (17) slots
  * but only 16 initializers are listed. C zero-initializes the remaining
  * slot, so the last index also maps to 0% fragmentation.
  */
 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
 	100,	/* 512B	*/
 	100,	/* 1K	*/
 	98,	/* 2K	*/
 	95,	/* 4K	*/
 	90,	/* 8K	*/
 	80,	/* 16K	*/
 	70,	/* 32K	*/
 	60,	/* 64K	*/
 	50,	/* 128K	*/
 	40,	/* 256K	*/
 	30,	/* 512K	*/
 	20,	/* 1M	*/
 	15,	/* 2M	*/
 	10,	/* 4M	*/
 	5,	/* 8M	*/
 	0	/* 16M	*/
 };
 
 /*
  * Calculate the metaslab's fragmentation metric. A return value
  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
  * not support this metric. Otherwise, the return value should be in the
  * range [0, 100].
  */
 static uint64_t
 metaslab_fragmentation(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	space_map_t *sm;
 	uint64_t weighted = 0;
 	uint64_t free_total = 0;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM))
 		return (ZFS_FRAG_INVALID);
 
 	/*
 	 * Without a space map the entire metaslab is free, and therefore
 	 * not fragmented at all.
 	 */
 	sm = msp->ms_sm;
 	if (sm == NULL)
 		return (0);
 
 	/*
 	 * A space map whose buffer is not the size of space_map_phys_t has
 	 * not been upgraded. On a writeable pool, request a forced condense
 	 * so that it gets upgraded the next time it is encountered.
 	 */
 	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		uint64_t txg = spa_syncing_txg(spa);
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		if (spa_writeable(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
 			    "msp %p, vd %p", txg, msp, vd);
 		}
 		return (ZFS_FRAG_INVALID);
 	}
 
 	/*
 	 * Accumulate the space of every non-empty histogram bucket, both
 	 * unweighted and weighted by that bucket's zfs_frag_table entry.
 	 */
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t space = 0;
 		uint8_t shift = sm->sm_shift;
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 
 		if (sm->sm_phys->smp_histogram[i] != 0) {
 			space = sm->sm_phys->smp_histogram[i] << (i + shift);
 			free_total += space;
 
 			ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 			weighted += space * zfs_frag_table[idx];
 		}
 	}
 
 	/* The metric is the space-weighted average of the table values. */
 	if (free_total > 0)
 		weighted /= free_total;
 	ASSERT3U(weighted, <=, 100);
 	return (weighted);
 }
 
 /*
  * Compute a weight -- a selection preference value -- for the given metaslab.
  * This is based on the amount of free space, the level of fragmentation,
  * the LBA range, and whether the metaslab is loaded.
  */
 static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t free_space, weight;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Nothing to weigh on a vdev that is in the process of being
 	 * removed.
 	 */
 	if (vd->vdev_removing) {
 		ASSERT0(space_map_allocated(msp->ms_sm));
 		ASSERT0(vd->vdev_ms_shift);
 		return (0);
 	}
 
 	/* Start from the metaslab's free space. */
 	free_space = msp->ms_size - space_map_allocated(msp->ms_sm);
 
 	msp->ms_fragmentation = metaslab_fragmentation(msp);
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
 		 * Scale the free space down in inverse proportion to the
 		 * fragmentation. The fragmentation value is reduced by 1
 		 * so that a 100% fragmented metaslab is not excluded
 		 * completely.
 		 */
 		free_space *= 100 - (msp->ms_fragmentation - 1);
 		free_space /= 100;
 
 		/*
 		 * A result below SPA_MINBLOCKSIZE would mean we never
 		 * allocate from this metaslab again. If scaling left a
 		 * nonzero value that small, raise it to SPA_MINBLOCKSIZE
 		 * so that the remaining space can still be consumed.
 		 */
 		if (free_space > 0 && free_space < SPA_MINBLOCKSIZE)
 			free_space = SPA_MINBLOCKSIZE;
 	}
 	weight = free_space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular
 	 * velocity, so outer recording zones have higher bandwidth than
 	 * inner ones by roughly the ratio of the track diameters
 	 * (typically around 2:1). Lower-numbered metaslabs therefore get
 	 * a multiplier ranging from 2x down to 1x, which favors the
 	 * metaslab with the most free bandwidth over the one with the
 	 * most free space.
 	 */
 	if (metaslab_lba_weighting_enabled) {
 		uint64_t base = weight;
 
 		weight = 2 * base - (msp->ms_id * base) / vd->vdev_ms_count;
 		ASSERT(weight >= free_space && weight <= 2 * free_space);
 	}
 
 	/*
 	 * Carry over the active bits of a loaded metaslab so that it stays
 	 * preferable to inactive ones, unless its fragmentation is invalid
 	 * or exceeds zfs_metaslab_fragmentation_threshold.
 	 */
 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold)
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (weight);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	int error;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		/* Let any in-flight load finish, then load if still needed. */
 		metaslab_load_wait(msp);
 
 		error = msp->ms_loaded ? 0 : metaslab_load(msp);
 		if (error != 0) {
 			/* Load failed: re-sort the metaslab with weight 0. */
 			metaslab_group_sort(msp->ms_group, msp, 0);
 			return (error);
 		}
 
 		/* Re-sort with the activation bits added to the weight. */
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
 
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t size)
 {
 	uint64_t new_weight;
 
 	/*
 	 * A size below SPA_MINBLOCKSIZE means we will never allocate from
 	 * this metaslab again, so it must be empty by now; otherwise we
 	 * would be leaving space on the table.
 	 */
 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
 
 	new_weight = MIN(msp->ms_weight, size);
 	metaslab_group_sort(msp->ms_group, msp, new_weight);
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	metaslab_load_wait(msp);
 	if (!msp->ms_loaded)
 		(void) metaslab_load(msp);
 
 	/*
 	 * Set the ms_access_txg value so that we don't unload it right away.
 	 */
 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
 	mutex_exit(&msp->ms_lock);
 }
 
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 	metaslab_t *msp, *next;
 	int visited = 0;
 
 	/*
 	 * When shutting down, or with preloading disabled, only wait for
 	 * the group's task queue and dispatch nothing new.
 	 */
 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
 		taskq_wait(mg->mg_taskq);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 
 	/* Walk the group's metaslabs in tree order. */
 	for (msp = avl_first(tree); msp != NULL; msp = next) {
 		/* Fetch the successor now, while mg_lock is still held. */
 		next = AVL_NEXT(tree, msp);
 
 		/*
 		 * Only the first metaslab_preload_limit metaslabs are
 		 * preloaded, plus any metaslab with a pending forced
 		 * condense, so that the condense can happen in the next
 		 * txg.
 		 */
 		if (++visited > metaslab_preload_limit &&
 		    !msp->ms_condense_wanted)
 			continue;
 
 		/*
 		 * Lock ordering requires ms_lock to be taken before
 		 * mg_lock, so mg_lock is dropped around the dispatch.
 		 * The tree may get reordered in the meantime, but a
 		 * metaslab cannot be removed from it while we are in
 		 * syncing context; the worst outcome is preloading a
 		 * less than optimal metaslab.
 		 */
 		mutex_exit(&mg->mg_lock);
 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
 		    msp, TQ_SLEEP) != 0);
 		mutex_enter(&mg->mg_lock);
 	}
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Determine if the space map's on-disk footprint is past our tolerance
  * for inefficiency. We would like to use the following criteria to make
  * our decision:
  *
  * 1. The size of the space map object should not dramatically increase as a
  * result of writing out the free space range tree.
  *
  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
  * times the size of the free space range tree representation
  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
  *
  * 3. The on-disk size of the space map should actually decrease.
  *
  * Checking the first condition is tricky since we don't want to walk
  * the entire AVL tree calculating the estimated on-disk size. Instead we
  * use the size-ordered range tree in the metaslab and calculate the
  * size required to write out the largest segment in our free tree. If the
  * size required to represent that segment on disk is larger than the space
  * map object then we avoid condensing this map.
  *
  * To determine the second criterion we use a best-case estimate and assume
  * each segment can be represented on-disk as a single 64-bit entry. We refer
  * to this best-case estimate as the space map's minimal form.
  *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
  * zfs_metaslab_condense_block_threshold - we only condense if the space used
  * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	range_seg_t *rs;
 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
 	dmu_object_info_t doi;
 	/*
 	 * Shift a 64-bit one so that the shift is done in the width of the
 	 * destination, matching the 1ULL << vdev_ashift used elsewhere in
 	 * this file.
 	 */
 	uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	/*
 	 * Use the ms_size_tree range tree, which is ordered by size, to
 	 * obtain the largest segment in the free tree. We always condense
 	 * metaslabs that are empty and metaslabs for which a condense
 	 * request has been made.
 	 */
 	rs = avl_last(&msp->ms_size_tree);
 	if (rs == NULL || msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	/*
 	 * Calculate the number of 64-bit entries this segment would
 	 * require when written to disk. If this single segment would be
 	 * larger on-disk than the entire current on-disk structure, then
 	 * clearly condensing will increase the on-disk structure size.
 	 */
 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
 	entries = size / (MIN(size, SM_RUN_MAX));
 	segsz = entries * sizeof (uint64_t);
 
 	/* Best case: one 64-bit entry per segment of the free tree. */
 	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
 	object_size = space_map_length(msp->ms_sm);
 
 	/* Use the larger of the object's data block size and vdev block size. */
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
 	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
 
 	/*
 	 * Condense only if all three hold: the largest segment fits in the
 	 * current object, the object is at least zfs_condense_pct percent of
 	 * the best-case size, and the object exceeds the block threshold.
 	 */
 	return (segsz <= object_size &&
 	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed by
  * the entries of the free range tree.
  */
 static void
 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
 	range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 
 	/*
 	 * The caller must hold ms_lock, the metaslab must be loaded, and
 	 * we must be in the first sync pass.
 	 */
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(msp->ms_loaded);
 
 
 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	/* Any pending forced-condense request is satisfied by this call. */
 	msp->ms_condense_wanted = B_FALSE;
 
 	/*
 	 * Create a range tree that is 100% allocated. We remove segments
 	 * that have been freed in this txg, any deferred frees that exist,
 	 * and any allocation in the future. Removing segments should be
 	 * a relatively inexpensive operation since we expect these trees to
 	 * have a small number of nodes.
 	 */
 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
 
 	/*
 	 * Remove what's been freed in this txg from the condense_tree.
 	 * Since we're in sync_pass 1, we know that all the frees from
 	 * this txg are in the freetree.
 	 */
 	range_tree_walk(freetree, range_tree_remove, condense_tree);
 
 	/* Remove the deferred frees. */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defertree[t],
 		    range_tree_remove, condense_tree);
 	}
 
 	/* Remove allocations that belong to txgs after this one. */
 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
 		    range_tree_remove, condense_tree);
 	}
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing
 	 * other consumers to change its content. Set the
 	 * metaslab's ms_condensing flag to ensure that
 	 * allocations on this metaslab do not occur while we're
 	 * in the middle of committing it to disk. This is only critical
 	 * for the ms_tree as all other range trees use per txg
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	/* The lock is dropped only around the truncate call. */
 	mutex_exit(&msp->ms_lock);
 	space_map_truncate(sm, tx);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * While we would ideally like to create a space_map representation
 	 * that consists only of allocation records, doing so can be
 	 * prohibitively expensive because the in-core free tree can be
 	 * large, and therefore computationally expensive to subtract
 	 * from the condense_tree. Instead we sync out two trees, a cheap
 	 * allocation only tree followed by the in-core free tree. While not
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 
 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
 	/* Both trees are written; lift the condensing flag again. */
 	msp->ms_condensing = B_FALSE;
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
 	range_tree_t **freed_tree =
 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
 	dmu_tx_t *tx;
 	/* Remembered so we can tell below whether a new object was created. */
 	uint64_t object = space_map_object(msp->ms_sm);
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (*freetree == NULL) {
 		ASSERT3P(alloctree, ==, NULL);
 		return;
 	}
 
 	ASSERT3P(alloctree, !=, NULL);
 	ASSERT3P(*freetree, !=, NULL);
 	ASSERT3P(*freed_tree, !=, NULL);
 
 	/*
 	 * Normally, we don't want to process a metaslab if there
 	 * are no allocations or frees to perform. However, if the metaslab
 	 * is being forced to condense we need to let it through.
 	 */
 	if (range_tree_space(alloctree) == 0 &&
 	    range_tree_space(*freetree) == 0 &&
 	    !msp->ms_condense_wanted)
 		return;
 
 	/*
 	 * The only state that can actually be changing concurrently with
 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
 	 * be modifying this txg's alloctree, freetree, freed_tree, or
 	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
 	 * space_map ASSERTs. We drop it whenever we call into the DMU,
 	 * because the DMU can call down to us (e.g. via zio_free()) at
 	 * any time.
 	 */
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	/* No space map yet: allocate an object in the MOS and open it. */
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
 		    &msp->ms_lock));
 		ASSERT(msp->ms_sm != NULL);
 	}
 
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space_map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	/*
 	 * Either rewrite the whole space map in condensed form (only for a
 	 * loaded metaslab in sync pass 1), or append this txg's allocations
 	 * and frees to it.
 	 */
 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
 	    metaslab_should_condense(msp)) {
 		metaslab_condense(msp, txg, tx);
 	} else {
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
 	} else {
 		/*
 		 * Since the space map is not loaded we simply update the
 		 * existing histogram with what was freed in this txg. This
 		 * means that the on-disk histogram may not have an accurate
 		 * view of the free space but it's close enough to allow
 		 * us to make allocation decisions.
 		 */
 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
 	}
 	/* Put the updated histogram back into the group and re-verify. */
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freetree and
 	 * freed_tree. We can safely do this since the freed_tree is
 	 * guaranteed to be empty on the initial pass.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		range_tree_swap(freetree, freed_tree);
 	} else {
 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
 	}
 	range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	mutex_exit(&msp->ms_lock);
 
 	/*
 	 * If the space map object changed (e.g. one was created above),
 	 * record the new object number in this metaslab's slot of the
 	 * vdev's metaslab array.
 	 */
 	if (object != space_map_object(msp->ms_sm)) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 	dmu_tx_commit(tx);
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	range_tree_t **freed_tree;
 	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
 	 * alloctrees, freetrees, and defertree and add its capacity to
 	 * the vdev.
 	 */
 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
 		for (int t = 0; t < TXG_SIZE; t++) {
 			ASSERT(msp->ms_alloctree[t] == NULL);
 			ASSERT(msp->ms_freetree[t] == NULL);
 
 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
 			    &msp->ms_lock);
 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
 			    &msp->ms_lock);
 		}
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			ASSERT(msp->ms_defertree[t] == NULL);
 
 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
 			    &msp->ms_lock);
 		}
 
 		vdev_space_update(vd, 0, 0, msp->ms_size);
 	}
 
 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
 
 	/*
 	 * defer_delta is the space entering the defer tree (what was just
 	 * freed) minus the space about to leave it (the old contents).
 	 */
 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
 	defer_delta = range_tree_space(*freed_tree) -
 	    range_tree_space(*defer_tree);
 
 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	/*
 	 * If there's a metaslab_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
 	 */
 	metaslab_load_wait(msp);
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and the
 	 * defer_tree -- this is safe to do because we've just emptied out
 	 * the defer_tree.
 	 */
 	range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
 	range_tree_swap(freed_tree, defer_tree);
 
 	space_map_update(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 
 	/*
 	 * A loaded metaslab whose ms_access_txg is older than this txg is
 	 * unloaded, unless metaslab_debug_unload is set. Before that we
 	 * verify that it holds no allocations for upcoming txgs.
 	 */
 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 			VERIFY0(range_tree_space(
 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
 		}
 
 		if (!metaslab_debug_unload)
 			metaslab_unload(msp);
 	}
 
 	/* Recompute the weight and reposition the metaslab in its group. */
 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Reassess a metaslab group: call metaslab_group_alloc_update() on it,
  * store the result of metaslab_group_fragmentation() in mg_fragmentation,
  * and then call metaslab_group_preload().
  */
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	metaslab_group_alloc_update(mg);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
 	 * Preload the next potential metaslabs
 	 */
 	metaslab_group_preload(mg);
 }
 
 /*
  * Return the distance, in bytes, between metaslab msp and the metaslab
  * that contains dva.  The distance is the difference between the two
  * metaslab indices, scaled back up by the vdev's metaslab shift, so it is
  * 0 when dva falls inside msp.  A DVA that lives on a different vdev is
  * reported as 1ULL << 63.
  */
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t shift = vd->vdev_ms_shift;
 	uint64_t dva_ms = DVA_GET_OFFSET(dva) >> shift;
 	uint64_t own_ms = msp->ms_id;
 	uint64_t gap;
 
 	if (DVA_GET_VDEV(dva) != vd->vdev_id)
 		return (1ULL << 63);
 
 	gap = (dva_ms > own_ms) ? (dva_ms - own_ms) : (own_ms - dva_ms);
 	return (gap << shift);
 }
 
 /*
  * Try to allocate asize bytes from one of the metaslabs of group mg.
  *
  * A candidate is picked by walking mg_metaslab_tree under mg_lock.  The
  * candidate's ms_lock is then taken, the candidate is re-validated,
  * activated, and asked for space through metaslab_block_alloc().  The
  * whole selection is repeated until an allocation succeeds or no
  * candidate is left.
  *
  * dva[0..d-1] are the DVAs already chosen for this block.  If one of them
  * is on this group's vdev, the metaslab is activated with
  * METASLAB_WEIGHT_SECONDARY and candidates closer than a target distance
  * (derived from min_distance) to any of those DVAs are skipped; otherwise
  * METASLAB_WEIGHT_PRIMARY is used and the first usable candidate is taken.
  *
  * On success the range is added to ms_alloctree for txg (dirtying the
  * vdev first if that tree was empty), ms_access_txg is set to
  * txg + metaslab_unload_delay, and the allocated offset is returned.
  * Returns -1ULL when nothing suitable was found.  psize is only used in
  * the debug message.
  */
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	uint64_t activation_weight;
 	uint64_t target_distance;
 	int i;
 
 	/*
 	 * Secondary activation is used as soon as an earlier DVA of this
 	 * block already sits on this group's vdev.
 	 */
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (i = 0; i < d; i++) {
 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 			break;
 		}
 	}
 
 	for (;;) {
 		boolean_t was_active;
 
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
 			/*
 			 * The first candidate whose weight is below asize
 			 * ends the search for this group.
 			 * NOTE(review): presumably the tree is ordered by
 			 * descending weight, so later entries cannot do
 			 * better -- confirm against the tree's comparator.
 			 */
 			if (msp->ms_weight < asize) {
 				spa_dbgmsg(spa, "%s: failed to meet weight "
 				    "requirement: vdev %llu, txg %llu, mg %p, "
 				    "msp %p, psize %llu, asize %llu, "
 				    "weight %llu", spa_name(spa),
 				    mg->mg_vd->vdev_id, txg,
 				    mg, msp, psize, asize, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
 
 			/*
 			 * If the selected metaslab is condensing, skip it.
 			 */
 			if (msp->ms_condensing)
 				continue;
 
 			/*
 			 * Remember whether it was active while we still hold
 			 * mg_lock; this is compared again below once ms_lock
 			 * is held.
 			 */
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
 
 			/*
 			 * A metaslab whose space map reports nothing
 			 * allocated must be 1.5 * min_distance away; any
 			 * other only min_distance.
 			 */
 			target_distance = min_distance +
 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
 			    min_distance >> 1);
 
 			/* Accept only if far enough from every earlier DVA. */
 			for (i = 0; i < d; i++)
 				if (metaslab_distance(msp, &dva[i]) <
 				    target_distance)
 					break;
 			if (i == d)
 				break;
 		}
 		mutex_exit(&mg->mg_lock);
 		if (msp == NULL)
 			return (-1ULL);
 
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
 		if (msp->ms_weight < asize || (was_active &&
 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * A metaslab currently active as secondary is passivated
 		 * (keeping its non-activation weight bits) rather than
 		 * reused for a primary allocation; then pick again.
 		 */
 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
 			metaslab_passivate(msp,
 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again as
 		 * we can't manipulate this metaslab until it's committed
 		 * to disk.
 		 */
 		if (msp->ms_condensing) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/* Success leaves the loop with ms_lock still held. */
 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
 			break;
 
 		/*
 		 * Allocation failed: passivate with the value reported by
 		 * metaslab_block_maxsize() as the new weight and retry.
 		 */
 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
 		mutex_exit(&msp->ms_lock);
 	}
 
 	/* First allocation from this metaslab in txg: dirty the vdev. */
 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
 	msp->ms_access_txg = txg + metaslab_unload_delay;
 
 	mutex_exit(&msp->ms_lock);
 
 	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  *
  * Fills in dva[d] with an allocation of psize bytes (rounded up to the
  * chosen vdev's allocation size) from metaslab class mc in txg.
  * dva[0..d-1] are the DVAs already allocated for the same block.  hintdva,
  * when non-NULL, names the vdev to start from.  flags is consulted for
  * METASLAB_HINTBP_AVOID, METASLAB_GANG_HEADER and CAN_FASTGANG().
  *
  * Returns 0 on success.  On failure dva[d] is zeroed and ENOSPC is
  * returned.
  */
 static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
 {
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	/*
 	 * The minimum distance asked of metaslab_group_alloc() is
 	 * vdev_asize >> dshift; dshift grows by one on every retry pass, so
 	 * the requirement starts at 1/8 of the vdev and halves each time.
 	 */
 	int dshift = 3;
 	int all_zero;
 	int zio_lock = B_FALSE;
 	boolean_t allocatable;
 	uint64_t offset = -1ULL;
 	uint64_t asize;
 	uint64_t distance;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 */
 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
 		return (SET_ERROR(ENOSPC));
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  That
 	 * way, we can hope for locality in vdev_cache, plus it makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 
 		/*
 		 * It's possible the vdev we're using as the hint no
 		 * longer exists (i.e. removed). Consult the rotor when
 		 * all else fails.
 		 */
 		if (vd != NULL) {
 			mg = vd->vdev_mg;
 
 			if (flags & METASLAB_HINTBP_AVOID &&
 			    mg->mg_next != NULL)
 				mg = mg->mg_next;
 		} else {
 			mg = mc->mc_rotor;
 		}
 	} else if (d != 0) {
 		/* Continue on the group after the previous DVA's vdev. */
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
 	} else {
 		mg = mc->mc_rotor;
 	}
 
 	/*
 	 * If the hint put us into the wrong metaslab class, or into a
 	 * metaslab group that has been passivated, just follow the rotor.
 	 */
 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mc->mc_rotor;
 
 	/* Remember the starting group so one full lap can be detected. */
 	rotor = mg;
 top:
 	/*
 	 * all_zero stays true only if no group in this pass was skipped as
 	 * failing and none was asked for a non-zero distance; it decides
 	 * below whether another, more relaxed pass is worthwhile.
 	 */
 	all_zero = B_TRUE;
 	do {
 		ASSERT(mg->mg_activation_count == 1);
 
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
 		if (zio_lock) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
 
 		/*
 		 * Determine if the selected metaslab group is eligible
 		 * for allocations. If we're ganging or have requested
 		 * an allocation for the smallest gang block size
 		 * then we don't want to avoid allocating to this
 		 * metaslab group. If we're in this condition we should
 		 * try to allocate from any device possible so that we
 		 * don't inadvertently return ENOSPC and suspend the pool
 		 * even though space is still available.
 		 */
 		if (allocatable && CAN_FASTGANG(flags) &&
 		    psize > SPA_GANGBLOCKSIZE)
 			allocatable = metaslab_group_allocatable(mg);
 
 		if (!allocatable)
 			goto next;
 
 		/*
 		 * Avoid writing single-copy data to a failing vdev
 		 * unless the user instructs us that it is okay.
 		 * This only applies on the first pass (dshift == 3), to the
 		 * first DVA, and to vdevs without children; clearing
 		 * all_zero guarantees a later pass that may use the vdev.
 		 */
 		if ((vd->vdev_stat.vs_write_errors > 0 ||
 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
 		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
 			all_zero = B_FALSE;
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
 		/*
 		 * A requested distance no larger than one metaslab is
 		 * treated as no requirement at all.
 		 */
 		distance = vd->vdev_asize >> dshift;
 		if (distance <= (1ULL << vd->vdev_ms_shift))
 			distance = 0;
 		else
 			all_zero = B_FALSE;
 
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
 		    dva, d);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 */
 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
 				/*
 				 * Percent used for the vdev and for the
 				 * class; the +1 avoids dividing by zero.
 				 */
 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 				/*
 				 * Calculate how much more or less we should
 				 * try to allocate from this device during
 				 * this iteration around the rotor.
 				 * For example, if a device is 80% full
 				 * and the pool is 20% full then we should
 				 * reduce allocations by 60% on this device.
 				 *
 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
 				 *
 				 * This reduces allocations by 307K for this
 				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
 				    (int64_t)mg->mg_aliquot) / 100;
 			} else if (!metaslab_bias_enabled) {
 				mg->mg_bias = 0;
 			}
 
 			/*
 			 * Once this group has received its (biased) aliquot,
 			 * advance the rotor to the next group.
 			 */
 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
 				mc->mc_aliquot = 0;
 			}
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
 next:
 		/* This group yielded nothing: move the rotor past it. */
 		mc->mc_rotor = mg->mg_next;
 		mc->mc_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * A full lap failed, but some group was either skipped as failing or
 	 * asked for a non-zero distance: relax the distance and try again.
 	 */
 	if (!all_zero) {
 		dshift++;
 		ASSERT(dshift < 64);
 		goto top;
 	}
 
 	/*
 	 * The last group examined was not allocatable and the check has not
 	 * yet been made under SCL_ZIO: start over, this time evaluating
 	 * vdev_allocatable() with that config lock held.
 	 */
 	if (!allocatable && !zio_lock) {
 		dshift = 3;
 		zio_lock = B_TRUE;
 		goto top;
 	}
 
 	/* Nothing worked; leave dva[d] invalid. */
 	bzero(&dva[d], sizeof (dva_t));
 
 	return (SET_ERROR(ENOSPC));
 }
 
 /*
  * Release the space described by dva in the context of transaction
  * group txg.
  *
  * With now set, the range is pulled out of the metaslab's allocation tree
  * for txg and handed straight back to the in-core free tree (ms_tree).
  * Otherwise it is queued on the metaslab's free tree for txg, dirtying the
  * vdev first when that tree was empty.
  *
  * Nothing is done for a txg beyond spa_freeze_txg().  A DVA that names an
  * unknown top-level vdev or an out-of-range metaslab is logged and
  * otherwise ignored.
  */
 static void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
 {
 	uint64_t vdev_id = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	int slot = txg & TXG_MASK;
 	metaslab_t *msp;
 	vdev_t *vd;
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	vd = vdev_lookup_top(spa, vdev_id);
 	if (vd == NULL || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
 		    (u_longlong_t)vdev_id, (u_longlong_t)offset);
 		ASSERT(0);
 		return;
 	}
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	/* A gang DVA is sized as one gang block on this vdev. */
 	if (DVA_GET_GANG(dva))
 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 	mutex_enter(&msp->ms_lock);
 
 	if (!now) {
 		/* Deferred free: the first entry for this txg dirties the vdev. */
 		if (range_tree_space(msp->ms_freetree[slot]) == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		range_tree_add(msp->ms_freetree[slot], offset, size);
 	} else {
 		/* Immediate free: undo the allocation made in this txg. */
 		range_tree_remove(msp->ms_alloctree[slot], offset, size);
 
 		VERIFY(!msp->ms_condensing);
 		VERIFY3U(offset, >=, msp->ms_start);
 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
 		    msp->ms_size);
 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		range_tree_add(msp->ms_tree, offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Intent log support: upon opening the pool after a crash, notify the SPA
  * of blocks that the intent log has allocated for immediate write, but
  * which are still considered free by the SPA because the last transaction
  * group didn't commit yet.
  *
  * A txg of 0 requests a dry run: the DVA is only checked for being
  * claimable and nothing is modified.  Returns 0 on success, ENXIO when the
  * DVA does not map to a known metaslab, ENOENT when the range is not
  * present in the metaslab's free tree, or the error from
  * metaslab_activate().
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	uint64_t vdev_id = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	int slot = txg & TXG_MASK;
 	int error = 0;
 	metaslab_t *msp;
 	vdev_t *vd;
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	vd = vdev_lookup_top(spa, vdev_id);
 	if (vd == NULL || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	/* A gang DVA is sized as one gang block on this vdev. */
 	if (DVA_GET_GANG(dva))
 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	/* Only a real claim (no error, txg != 0) changes any state. */
 	if (error == 0 && txg != 0) {
 		VERIFY(!msp->ms_condensing);
 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
 		range_tree_remove(msp->ms_tree, offset, size);
 
 		if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
 			if (range_tree_space(msp->ms_alloctree[slot]) == 0)
 				vdev_dirty(vd, VDD_METASLAB, msp, txg);
 			range_tree_add(msp->ms_alloctree[slot], offset, size);
 		}
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (error);
 }
 
 /*
  * Allocate ndvas DVAs of psize bytes each for bp from metaslab class mc
  * in transaction group txg, holding SCL_ALLOC as reader for the duration.
  *
  * hintbp is optional: when non-NULL its DVAs are passed to
  * metaslab_alloc_dva() as placement hints.  flags is passed through
  * unchanged.
  *
  * Returns 0 on success, after which bp's birth txg is set to txg.  Returns
  * ENOSPC when the class has no rotor (no vdevs), or the error from
  * metaslab_alloc_dva(); in the latter case every DVA allocated so far is
  * freed again and zeroed, so bp is left without DVAs.
  */
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
 {
 	dva_t *dva = bp->blk_dva;
 	/*
 	 * hintbp may legitimately be NULL (see the assertion below), so do
 	 * not form a member address through it; pass the absence of a hint
 	 * on as an explicit NULL instead.
 	 */
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
 	ASSERT(bp->blk_birth == 0);
 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags);
 		if (error != 0) {
 			/* Unwind: give back the DVAs allocated before d. */
 			for (d--; d >= 0; d--) {
 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
 				bzero(&dva[d], sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, txg);
 
 	return (0);
 }
 
 /*
  * Free every DVA of bp in the context of txg by calling
  * metaslab_free_dva() on each one while holding SCL_FREE as reader.  The
  * now flag is forwarded unchanged.
  */
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *cur = bp->blk_dva;
 	int remaining = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	for (; remaining > 0; remaining--, cur++)
 		metaslab_free_dva(spa, cur, txg, now);
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 /*
  * Claim every DVA of bp in txg.  For a non-zero txg the function first
  * calls itself with txg 0 (a dry run) and gives up with that error if any
  * DVA is not claimable.  The claims themselves are made under SCL_ALLOC
  * held as reader and stop at the first failure.  Returns 0 or the first
  * error encountered.
  */
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 	int error = 0;
 	int d;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * First do a dry run to make sure all DVAs are claimable,
 	 * so we don't have to unwind from partial failures below.
 	 */
 	if (txg != 0) {
 		error = metaslab_claim(spa, bp, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	for (d = 0; d < ndvas && error == 0; d++)
 		error = metaslab_claim_dva(spa, &dva[d], txg);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 /*
  * Debugging aid, active only when ZFS_DEBUG_ZIO_FREE is set in zfs_flags.
  * For every DVA of bp, run range_tree_verify() on the range against the
  * owning metaslab's ms_tree (if the metaslab is loaded) and against each
  * of its per-txg free trees and defer trees.  Runs under SCL_VDEV held as
  * reader.
  */
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if (!(zfs_flags & ZFS_DEBUG_ZIO_FREE))
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 		uint64_t offset = DVA_GET_OFFSET(dva);
 		uint64_t size = DVA_GET_ASIZE(dva);
 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 		int t;
 
 		if (msp->ms_loaded)
 			range_tree_verify(msp->ms_tree, offset, size);
 
 		for (t = 0; t < TXG_SIZE; t++)
 			range_tree_verify(msp->ms_freetree[t], offset, size);
 		for (t = 0; t < TXG_DEFER_SIZE; t++)
 			range_tree_verify(msp->ms_defertree[t], offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 296519)
@@ -1,2012 +1,2013 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 iXsystems, Inc
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sunddi.h>
 #include <sys/sa_impl.h>
 #include <sys/dnode.h>
 #include <sys/errno.h>
 #include <sys/zfs_context.h>
 
 /*
  * ZFS System attributes:
  *
  * A generic mechanism to allow for arbitrary attributes
  * to be stored in a dnode.  The data will be stored in the bonus buffer of
  * the dnode and if necessary a special "spill" block will be used to handle
  * overflow situations.  The spill block will be sized to fit the data
  * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
  * spill block is stored at the end of the current bonus buffer.  Any
  * attributes that would be in the way of the blkptr_t will be relocated
  * into the spill block.
  *
  * Attribute registration:
  *
  * Stored persistently on a per dataset basis
  * a mapping between attribute "string" names and their actual attribute
  * numeric values, length, and byteswap function.  The names are only used
  * during registration.  All attributes are known by their unique attribute
  * id value.  If an attribute can have a variable size then the value
  * 0 will be used to indicate this.
  *
  * Attribute Layout:
  *
  * Attribute layouts are a way to compactly store multiple attributes, but
  * without taking the overhead associated with managing each attribute
  * individually.  Since you will typically have the same set of attributes
  * stored in the same order a single table will be used to represent that
  * layout.  The ZPL for example will usually have only about 10 different
  * layouts (regular files, device files, symlinks,
  * regular files + scanstamp, files/dir with extended attributes, and then
  * you have the possibility of all of those minus ACL, because it would
  * be kicked out into the spill block)
  *
  * Layouts are simply an array of the attributes and their
  * ordering i.e. [0, 1, 4, 5, 2]
  *
  * Each distinct layout is given a unique layout number and that is what's
  * stored in the header at the beginning of the SA data buffer.
  *
  * A layout only covers a single dbuf (bonus or spill).  If a set of
  * attributes is split up between the bonus buffer and a spill buffer then
  * two different layouts will be used.  This allows us to byteswap the
  * spill without looking at the bonus buffer and keeps the on disk format of
  * the bonus and spill buffer the same.
  *
  * Adding a single attribute will cause the entire set of attributes to
  * be rewritten and could result in a new layout number being constructed
  * as part of the rewrite if no such layout exists for the new set of
  * attributes.  The new attribute will be appended to the end of the already
  * existing attributes.
  *
  * Both the attribute registration and attribute layout information are
  * stored in normal ZAP attributes.  There should be a small number of
  * known layouts and the set of attributes is assumed to typically be quite
  * small.
  *
  * The registered attributes and layout "table" information is maintained
  * in core and a special "sa_os_t" is attached to the objset_t.
  *
  * A special interface is provided to allow for quickly applying
  * a large set of attributes at once.  sa_replace_all_by_template() is
  * used to set an array of attributes.  This is used by the ZPL when
  * creating a brand new file.  The template that is passed into the function
  * specifies the attribute, size for variable length attributes, location of
  * data and special "data locator" function if the data isn't in a contiguous
  * location.
  *
  * Byteswap implications:
  *
  * Since the SA attributes are not entirely self describing we can't do
  * the normal byteswap processing.  The special ZAP layout attribute and
  * attribute registration attributes define the byteswap function and the
  * size of the attributes, unless it is variable sized.
  * The normal ZFS byteswapping infrastructure assumes you don't need
  * to read any objects in order to do the necessary byteswapping.  Whereas
  * SA attributes can only be properly byteswapped if the dataset is opened
  * and the layout/attribute ZAP attributes are available.  Because of this
  * the SA attributes will be byteswapped when they are first accessed by
  * the SA code that will read the SA data.
  */
 
 /*
  * Signature of an attribute-iteration callback.  Its users are outside
  * this part of the file.
  * NOTE(review): judging by the parameter names it receives the SA header,
  * the attribute's address, type and length, and a caller cookie (userp) --
  * confirm against the iterator that invokes it.
  */
 typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
     uint16_t length, int length_idx, boolean_t, void *userp);
 
 /* Forward declarations of file-local helpers. */
 static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
 static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
 static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
     void *data);
 static void sa_idx_tab_rele(objset_t *os, void *arg);
 static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
     int buflen);
 static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx);
 
 /*
  * Table of byteswap routines for attribute data: 64-, 32-, 16- and 8-bit
  * integer arrays, plus ZFS ACLs.
  * NOTE(review): the index is presumably the attribute's byteswap type
  * (e.g. SA_UINT64_ARRAY) -- confirm against its definition before
  * reordering entries.
  */
 arc_byteswap_func_t *sa_bswap_table[] = {
 	byteswap_uint64_array,
 	byteswap_uint32_array,
 	byteswap_uint16_array,
 	byteswap_uint8_array,
 	zfs_acl_byteswap,
 };
 
 /*
  * Copy l bytes of attribute data from s to t.
  *
  * When no data locator f is supplied, 8- and 16-byte attributes are
  * copied with direct 64-bit stores and everything else with bcopy();
  * otherwise the copy is delegated to sa_copy_data().
  *
  * Every argument is parenthesized in the expansion so that compound
  * expressions can be passed safely.  Arguments are still evaluated more
  * than once, so they must be free of side effects.  The expansion is a
  * brace-enclosed block, as it always has been, to keep existing call
  * sites compiling unchanged.
  */
 #define	SA_COPY_DATA(f, s, t, l) \
 	{ \
 		if ((f) == NULL) { \
 			if ((l) == 8) { \
 				*(uint64_t *)(t) = *(uint64_t *)(s); \
 			} else if ((l) == 16) { \
 				*(uint64_t *)(t) = *(uint64_t *)(s); \
 				*(uint64_t *)((uintptr_t)(t) + 8) = \
 				    *(uint64_t *)((uintptr_t)(s) + 8); \
 			} else { \
 				bcopy((s), (t), (l)); \
 			} \
 		} else \
 			sa_copy_data((f), (s), (t), (l)); \
 	}
 
 /*
  * This table is fixed and cannot be changed.  Its purpose is to
  * allow the SA code to work with both old/new ZPL file systems.
  * It contains the list of legacy attributes.  These attributes aren't
  * stored in the "attribute" registry zap objects, since older ZPL file systems
  * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
  * use this static table.
  *
  * Each entry lists, in order, the attribute's name, its length in bytes,
  * its byteswap type and its attribute number; the numbers run from 0 to 15
  * in table order.
  * NOTE(review): field meanings are read off the initializers and the
  * registration description above -- confirm against sa_attr_reg_t.
  */
 sa_attr_reg_t sa_legacy_attrs[] = {
 	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
 	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
 	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
 	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
 	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
 	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
 	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
 	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
 	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
 	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
 	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
 	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
 	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
 	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
 	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
 	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
 };
 
 /*
  * This is only used for objects of type DMU_OT_ZNODE.  It lists the 16
  * legacy attribute numbers (0-15) in ascending order.
  */
 sa_attr_type_t sa_legacy_zpl_layout[] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
 /*
  * Special dummy layout used for buffers with no attributes.
  */
 sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
 
 /* Number of entries in sa_legacy_attrs[]. */
 static int sa_legacy_attr_count = 16;
 /* kmem cache of sa_handle_t objects, created by sa_cache_init(). */
 static kmem_cache_t *sa_cache = NULL;
 
 /*ARGSUSED*/
 static int
 sa_cache_constructor(void *buf, void *unused, int kmflag)
 {
 	sa_handle_t *hdl = buf;
 
 	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
 /*ARGSUSED*/
 static void
 sa_cache_destructor(void *buf, void *unused)
 {
 	sa_handle_t *hdl = buf;
 	mutex_destroy(&hdl->sa_lock);
 }
 
 /*
  * Create the "sa_cache" kmem cache of sa_handle_t-sized objects, wired to
  * sa_cache_constructor() and sa_cache_destructor(), and store it in
  * sa_cache.
  */
 void
 sa_cache_init(void)
 {
 	sa_cache = kmem_cache_create("sa_cache",
 	    sizeof (sa_handle_t), 0, sa_cache_constructor,
 	    sa_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the sa_handle_t kmem cache if it was ever created.  The
  * sa_cache pointer itself is left untouched.
  */
 void
 sa_cache_fini(void)
 {
 	if (!sa_cache)
 		return;
 
 	kmem_cache_destroy(sa_cache);
 }
 
 /*
  * Three-way comparison of two sa_lot_t nodes by layout number (lot_num).
  * Returns -1, 0 or 1.
  */
 static int
 layout_num_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *lhs = arg1;
 	const sa_lot_t *rhs = arg2;
 
 	if (lhs->lot_num == rhs->lot_num)
 		return (0);
 
 	return ((lhs->lot_num > rhs->lot_num) ? 1 : -1);
 }
 
 /*
  * Three-way comparison of two sa_lot_t nodes, ordered first by lot_hash
  * and, for equal hashes, by lot_instance.  Returns -1, 0 or 1.
  */
 static int
 layout_hash_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *lhs = arg1;
 	const sa_lot_t *rhs = arg2;
 
 	if (lhs->lot_hash != rhs->lot_hash)
 		return ((lhs->lot_hash > rhs->lot_hash) ? 1 : -1);
 
 	if (lhs->lot_instance != rhs->lot_instance)
 		return ((lhs->lot_instance > rhs->lot_instance) ? 1 : -1);
 
 	return (0);
 }
 
 /*
  * Compare layout tbf against the attribute list attrs[0..count-1].
  *
  * Note the inverted convention: despite the name and return type, this
  * returns 0 when the two layouts are identical (same count, same
  * attributes in the same order) and 1 when they differ.
  */
 boolean_t
 sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
 {
 	int idx = 0;
 
 	if (tbf->lot_attr_count != count)
 		return (1);
 
 	while (idx != count) {
 		if (tbf->lot_attrs[idx] != attrs[idx])
 			return (1);
 		idx++;
 	}
 
 	return (0);
 }
 
 /*
  * Per-attribute hash contribution: the zfs_crc64_table entry selected by
  * the low 8 bits of the complemented attribute number.  The argument is
  * parenthesized so that a compound expression is hashed as a whole.
  */
 #define	SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ (attr)) & 0xFF])
 
 /*
  * Compute the hash of an attribute list: start from all ones and XOR in
  * SA_ATTR_HASH() of each attribute.  Because XOR is commutative the
  * result does not depend on the order of the attributes.
  */
 static uint64_t
 sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
 {
 	uint64_t hash = -1ULL;
 	int idx = 0;
 
 	while (idx != attr_count) {
 		hash ^= SA_ATTR_HASH(attrs[idx]);
 		idx++;
 	}
 
 	return (hash);
 }
 
 /*
  * Make sure the handle's spill buffer is held.  If it already is, return
  * 0 straight away.  Otherwise try dmu_spill_hold_existing() on the bonus
  * buffer and, when that succeeds, build the spill index (which must not
  * fail).  Returns 0 or the error from dmu_spill_hold_existing().
  */
 static int
 sa_get_spill(sa_handle_t *hdl)
 {
 	int rc;
 
 	if (hdl->sa_spill != NULL)
 		return (0);
 
 	rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill);
 	if (rc == 0)
 		VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 
 	return (rc);
 }
 
 /*
  * Main attribute lookup/update function
  * returns 0 for success or non zero for failures
  *
  * Operates on bulk array, first failure will abort further processing
  *
  * hdl      SA handle whose bonus (and, if needed, spill) buffer is used.
  * bulk     array of count attribute descriptors; sa_addr of each entry
  *          is reset to NULL before the attribute is located.
  * data_op  SA_LOOKUP copies attribute data out to sa_data; SA_UPDATE
  *          writes sa_data into the attribute, resizing or adding it via
  *          sa_modify_attrs() when necessary.
  * tx       when non-NULL, buffers holding a located attribute are marked
  *          dirty in this transaction.
  *
  * SA_LOOKUP returns ENOENT for an attribute found in neither buffer.
  * ECKSUM from the spill lookup is reported as EIO.
  */
 int
 sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     sa_data_op_t data_op, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	int i;
 	int error = 0;
 	/* Buffers already marked dirty during this call. */
 	sa_buf_type_t buftypes;
 
 	buftypes = 0;
 
 	ASSERT(count > 0);
 	for (i = 0; i != count; i++) {
 		ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
 
 		bulk[i].sa_addr = NULL;
 		/* First check the bonus buffer */
 
 		if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
 		    hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
 			SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
 			    SA_GET_HDR(hdl, SA_BONUS),
 			    bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
 			/* Dirty the bonus buffer at most once per call. */
 			if (tx && !(buftypes & SA_BONUS)) {
 				dmu_buf_will_dirty(hdl->sa_bonus, tx);
 				buftypes |= SA_BONUS;
 			}
 		}
 		/* Not in the bonus buffer: fall back to the spill block. */
 		if (bulk[i].sa_addr == NULL &&
 		    ((error = sa_get_spill(hdl)) == 0)) {
 			if (TOC_ATTR_PRESENT(
 			    hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
 				SA_ATTR_INFO(sa, hdl->sa_spill_tab,
 				    SA_GET_HDR(hdl, SA_SPILL),
 				    bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
 				/*
 				 * The spill buffer is only dirtied here when
 				 * the attribute keeps its size; a size change
 				 * goes through sa_modify_attrs() below.
 				 */
 				if (tx && !(buftypes & SA_SPILL) &&
 				    bulk[i].sa_size == bulk[i].sa_length) {
 					dmu_buf_will_dirty(hdl->sa_spill, tx);
 					buftypes |= SA_SPILL;
 				}
 			}
 		}
 		/*
 		 * ENOENT from sa_get_spill() is tolerated here; it is left
 		 * to the switch below to decide what a missing attribute
 		 * means.  Any other error aborts, with ECKSUM mapped to EIO.
 		 */
 		if (error && error != ENOENT) {
 			return ((error == ECKSUM) ? EIO : error);
 		}
 
 		switch (data_op) {
 		case SA_LOOKUP:
 			if (bulk[i].sa_addr == NULL)
 				return (SET_ERROR(ENOENT));
 			/* A NULL sa_data means the caller wants no copy. */
 			if (bulk[i].sa_data) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_addr, bulk[i].sa_data,
 				    bulk[i].sa_size);
 			}
 			continue;
 
 		case SA_UPDATE:
 			/* existing rewrite of attr */
 			if (bulk[i].sa_addr &&
 			    bulk[i].sa_size == bulk[i].sa_length) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_addr,
 				    bulk[i].sa_length);
 				continue;
 			} else if (bulk[i].sa_addr) { /* attr size change */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_REPLACE, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			} else { /* adding new attribute */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_ADD, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			}
 			if (error)
 				return (error);
 			/* Leaves the switch; the loop moves to the next entry. */
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Create a new in-core layout entry for the attribute list
  * attrs[0..attr_count-1] with layout number lot_num and hash value hash,
  * and insert it into both the by-number and the by-hash AVL trees of the
  * objset's SA state.  The caller must hold sa_lock.
  *
  * When zapadd is set the layout is also written to the layout ZAP object
  * (created and linked under the SA master object on first use) in
  * transaction tx, keyed by the decimal layout number.
  *
  * Returns the new entry.
  */
 static sa_lot_t *
 sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
     uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, *findtb;
 	int i;
 	avl_index_t loc;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	/* The entry keeps its own private copy of the attribute list. */
 	tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
 	tb->lot_attr_count = attr_count;
 	tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
 	tb->lot_num = lot_num;
 	tb->lot_hash = hash;
 	tb->lot_instance = 0;
 
 	if (zapadd) {
 		char attr_name[8];
 
 		if (sa->sa_layout_attr_obj == 0) {
 			sa->sa_layout_attr_obj = zap_create_link(os,
 			    DMU_OT_SA_ATTR_LAYOUTS,
 			    sa->sa_master_obj, SA_LAYOUTS, tx);
 		}
 
 		/*
 		 * The ZAP key is the layout number printed as a decimal int.
 		 * NOTE(review): the 2 passed to zap_update() is presumably
 		 * the per-element integer size in bytes -- confirm against
 		 * the zap_update() prototype.
 		 */
 		(void) snprintf(attr_name, sizeof (attr_name),
 		    "%d", (int)lot_num);
 		VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
 		    attr_name, 2, attr_count, attrs, tx));
 	}
 
 	list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
 	    offsetof(sa_idx_tab_t, sa_next));
 
 	/*
 	 * Count the attributes whose registered length is 0, i.e. the
 	 * variable-sized ones.
 	 */
 	for (i = 0; i != attr_count; i++) {
 		if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
 			tb->lot_var_sizes++;
 	}
 
 	avl_add(&sa->sa_layout_num_tree, tb);
 
 	/* verify we don't have a hash collision */
 	/*
 	 * If an entry with this hash and instance 0 already exists, walk
 	 * the entries sharing the hash and bump lot_instance past each one
 	 * that matches it, stopping at the first unused instance number, so
 	 * the (hash, instance) key inserted below is unique.
 	 */
 	if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
 		for (; findtb && findtb->lot_hash == hash;
 		    findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
 			if (findtb->lot_instance != tb->lot_instance)
 				break;
 			tb->lot_instance++;
 		}
 	}
 	avl_add(&sa->sa_layout_hash_tree, tb);
 	return (tb);
 }
 
 /*
  * Look up the layout matching attrs[0..count-1], whose hash is hash, and
  * return it through *lot.  All entries sharing the hash are compared
  * attribute by attribute; if none matches, a new layout is created (and
  * recorded on disk in tx) with the next layout number, i.e. the current
  * number of nodes in the by-number tree.  Runs under sa_lock.
  */
 static void
 sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
     int count, dmu_tx_t *tx, sa_lot_t **lot)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *cand, *match = NULL;
 	sa_lot_t key;
 	avl_index_t where;
 
 	mutex_enter(&sa->sa_lock);
 
 	/* Start at the first instance carrying this hash. */
 	key.lot_hash = hash;
 	key.lot_instance = 0;
 
 	for (cand = avl_find(&sa->sa_layout_hash_tree, &key, &where);
 	    cand != NULL && cand->lot_hash == hash;
 	    cand = AVL_NEXT(&sa->sa_layout_hash_tree, cand)) {
 		/* sa_layout_equal() returns 0 for identical layouts. */
 		if (sa_layout_equal(cand, attrs, count) == 0) {
 			match = cand;
 			break;
 		}
 	}
 
 	if (match == NULL) {
 		match = sa_add_layout_entry(os, attrs, count,
 		    avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
 	}
 
 	mutex_exit(&sa->sa_lock);
 	*lot = match;
 }
 
 /*
  * Set the block size of the handle's spill buffer so it can hold `size'
  * bytes: SPA_MINBLOCKSIZE for a zero size, otherwise `size' rounded up to
  * a multiple of SPA_MINBLOCKSIZE.  Sizes above SPA_OLD_MAXBLOCKSIZE are
  * rejected with EFBIG.
  */
 static int
 sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 {
 	uint32_t blocksize = SPA_MINBLOCKSIZE;
 	int error;
 
 	if (size > SPA_OLD_MAXBLOCKSIZE) {
 		ASSERT(0);
 		return (SET_ERROR(EFBIG));
 	}
 
 	if (size != 0)
 		blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
 
 	error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
 	ASSERT(error == 0);
 	return (error);
 }
 
 /*
  * Copy `buflen' bytes of attribute data into `target'.  Without a locator
  * function the data is copied straight from `datastart'; otherwise the
  * locator is called repeatedly and each chunk it reports is appended to
  * `target' until `buflen' bytes have been gathered.
  */
 static void
 sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
 {
 	boolean_t first = B_TRUE;
 	caddr_t dst = target;
 	void *chunk;
 	uint32_t chunk_len;
 	int copied = 0;
 
 	if (func == NULL) {
 		bcopy(datastart, target, buflen);
 		return;
 	}
 
 	while (copied < buflen) {
 		func(&chunk, &chunk_len, buflen, first, datastart);
 		bcopy(chunk, dst, chunk_len);
 		dst += chunk_len;
 		copied += chunk_len;
 		first = B_FALSE;
 	}
 }
 
 /*
  * Determine several different sizes for laying out `attr_count'
  * attributes into the buffer `db':
  *
  *  - the return value is the SA header size, rounded up to 8 bytes
  *    (the base header is taken as 0 bytes for a DMU_OT_ZNODE bonus type)
  *  - *total is the number of bytes to be stored, with the running sum
  *    aligned to 8 before each attribute is added
  *  - *index is the index in the attribute array where spill could
  *    occur, or -1 if no such point was found
  *
  * the boolean will_spill will be set when spilling is necessary.  It
  * is only set when the buftype is SA_BONUS
  *
  * When the objset forces spilling and buftype is SA_BONUS, everything
  * goes to the spill block: *total and *index are 0, *will_spill is
  * B_TRUE and 0 is returned.
  */
 static int
 sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
     boolean_t *will_spill)
 {
 	int var_size = 0;
 	int i;
 	int full_space;
 	int hdrsize;
 	int extra_hdrsize;
 
 	if (buftype == SA_BONUS && sa->sa_force_spill) {
 		*total = 0;
 		*index = 0;
 		*will_spill = B_TRUE;
 		return (0);
 	}
 
 	*index = -1;
 	*total = 0;
 	*will_spill = B_FALSE;
 
 	extra_hdrsize = 0;
 	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
 	    sizeof (sa_hdr_phys_t);
 
 	full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
 	ASSERT(IS_P2ALIGNED(full_space, 8));
 
 	for (i = 0; i != attr_count; i++) {
 		boolean_t is_var_sz;
 
 		*total = P2ROUNDUP(*total, 8);
 		*total += attr_desc[i].sa_length;
 		/* once spilling is certain only the total is still tracked */
 		if (*will_spill)
 			continue;
 
 		/* a registered length of 0 marks a variable-length attribute */
 		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
 		if (is_var_sz) {
 			var_size++;
 		}
 
 		/* only the second and later variable sizes grow the header */
 		if (is_var_sz && var_size > 1) {
 			/*
 			 * Don't worry that the spill block might overflow.
 			 * It will be resized if needed in sa_build_layouts().
 			 */
 			if (buftype == SA_SPILL ||
 			    P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
 			    *total < full_space) {
 				/*
 				 * Account for header space used by array of
 				 * optional sizes of variable-length attributes.
 				 * Record the extra header size in case this
 				 * increase needs to be reversed due to
 				 * spill-over.
 				 */
 				hdrsize += sizeof (uint16_t);
 				if (*index != -1)
 					extra_hdrsize += sizeof (uint16_t);
 			} else {
 				ASSERT(buftype == SA_BONUS);
 				if (*index == -1)
 					*index = i;
 				*will_spill = B_TRUE;
 				continue;
 			}
 		}
 
 		/*
 		 * find index of where spill *could* occur.
 		 * Then continue to count of remainder attribute
 		 * space.  The sum is used later for sizing bonus
 		 * and spill buffer.
 		 */
 		if (buftype == SA_BONUS && *index == -1 &&
 		    (*total + P2ROUNDUP(hdrsize, 8)) >
 		    (full_space - sizeof (blkptr_t))) {
 			*index = i;
 		}
 
 		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
 		    buftype == SA_BONUS)
 			*will_spill = B_TRUE;
 	}
 
 	/* undo header growth recorded after the potential spill point */
 	if (*will_spill)
 		hdrsize -= extra_hdrsize;
 
 	hdrsize = P2ROUNDUP(hdrsize, 8);
 	return (hdrsize);
 }
 
 /* Bytes a buffer needs to hold `total' bytes of data plus its header. */
 #define	BUF_SPACE_NEEDED(total, header) ((total) + (header))
 
 /*
  * Find layout that corresponds to ordering of attributes
  * If not found a new layout number is created and added to
  * persistent layout tables.
  *
  * The attribute data described by attr_desc[] is copied into the bonus
  * buffer and, when it does not fit, into the spill buffer; each
  * descriptor's sa_addr and sa_size are updated to the new location.
  * The bonus (and spill) index tables of the handle are rebuilt
  * afterwards, and a spill block that is no longer needed is removed.
  *
  * Returns 0 on success, or EFBIG if the attribute data exceeds
  * SPA_OLD_MAXBLOCKSIZE.
  */
 static int
 sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	uint64_t hash;
 	sa_buf_type_t buftype;
 	sa_hdr_phys_t *sahdr;
 	void *data_start;
 	int buf_space;
 	sa_attr_type_t *attrs, *attrs_start;
 	int i, lot_count;
 	int hdrsize;
 	int spillhdrsize = 0;
 	int used;
 	dmu_object_type_t bonustype;
 	sa_lot_t *lot;
 	int len_idx;
 	int spill_used;
 	boolean_t spilling;
 
 	dmu_buf_will_dirty(hdl->sa_bonus, tx);
 	bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
 
 	/* first determine bonus header size and sum of all attributes */
 	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
 	    SA_BONUS, &i, &used, &spilling);
 
 	if (used > SPA_OLD_MAXBLOCKSIZE)
 		return (SET_ERROR(EFBIG));
 
 	/* when spilling, the bonus length is capped to leave a blkptr_t free */
 	VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
 	    MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
 	    used + hdrsize, tx));
 
 	ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
 	    bonustype == DMU_OT_SA);
 
 	/* setup and size spill buffer when needed */
 	if (spilling) {
 		boolean_t dummy;
 
 		if (hdl->sa_spill == NULL) {
 			VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
 			    &hdl->sa_spill) == 0);
 		}
 		dmu_buf_will_dirty(hdl->sa_spill, tx);
 
 		/* size the attributes from the spill index (i) onwards */
 		spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
 		    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
 		    &spill_used, &dummy);
 
 		if (spill_used > SPA_OLD_MAXBLOCKSIZE)
 			return (SET_ERROR(EFBIG));
 
 		buf_space = hdl->sa_spill->db_size - spillhdrsize;
 		if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
 		    hdl->sa_spill->db_size)
 			VERIFY(0 == sa_resize_spill(hdl,
 			    BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
 	}
 
 	/* setup starting pointers to lay down data */
 	data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
 	sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
 	buftype = SA_BONUS;
 
 	/* with forced spill no attribute data is placed in the bonus buffer */
 	if (spilling)
 		buf_space = (sa->sa_force_spill) ?
 		    0 : SA_BLKPTR_SPACE - hdrsize;
 	else
 		buf_space = hdl->sa_bonus->db_size - hdrsize;
 
 	attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	lot_count = 0;
 
 	for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
 		uint16_t length;
 
 		ASSERT(IS_P2ALIGNED(data_start, 8));
 		ASSERT(IS_P2ALIGNED(buf_space, 8));
 		attrs[i] = attr_desc[i].sa_attr;
 		/* fixed-size attributes must be supplied at their registered size */
 		length = SA_REGISTERED_LEN(sa, attrs[i]);
 		if (length == 0)
 			length = attr_desc[i].sa_length;
 		else
 			VERIFY(length == attr_desc[i].sa_length);
 
 		if (buf_space < length) {  /* switch to spill buffer */
 			VERIFY(spilling);
 			VERIFY(bonustype == DMU_OT_SA);
 			/* finish the bonus header before leaving the bonus buffer */
 			if (buftype == SA_BONUS && !sa->sa_force_spill) {
 				sa_find_layout(hdl->sa_os, hash, attrs_start,
 				    lot_count, tx, &lot);
 				SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
 			}
 
 			/* restart hash, length index and layout for the spill */
 			buftype = SA_SPILL;
 			hash = -1ULL;
 			len_idx = 0;
 
 			sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
 			sahdr->sa_magic = SA_MAGIC;
 			data_start = (void *)((uintptr_t)sahdr +
 			    spillhdrsize);
 			attrs_start = &attrs[i];
 			buf_space = hdl->sa_spill->db_size - spillhdrsize;
 			lot_count = 0;
 		}
 		hash ^= SA_ATTR_HASH(attrs[i]);
 		attr_desc[i].sa_addr = data_start;
 		attr_desc[i].sa_size = length;
 		SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
 		    data_start, length);
 		/* variable-length attributes record their size in the header */
 		if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
 			sahdr->sa_lengths[len_idx++] = length;
 		}
 		VERIFY((uintptr_t)data_start % 8 == 0);
 		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 		    length), 8);
 		buf_space -= P2ROUNDUP(length, 8);
 		lot_count++;
 	}
 
 	/* layout of the buffer that was being filled when the loop ended */
 	sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
 
 	/*
 	 * Verify that old znodes always have layout number 0.
 	 * Must be DMU_OT_SA for arbitrary layouts
 	 */
 	VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
 	    (bonustype == DMU_OT_SA && lot->lot_num > 1));
 
 	if (bonustype == DMU_OT_SA) {
 		SA_SET_HDR(sahdr, lot->lot_num,
 		    buftype == SA_BONUS ? hdrsize : spillhdrsize);
 	}
 
 	kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
 	/* drop the old index tables and rebuild them for the new layout */
 	if (hdl->sa_bonus_tab) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 		hdl->sa_bonus_tab = NULL;
 	}
 	if (!sa->sa_force_spill)
 		VERIFY(0 == sa_build_index(hdl, SA_BONUS));
 	if (hdl->sa_spill) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 		if (!spilling) {
 			/*
 			 * remove spill block that is no longer needed.
 			 */
 			dmu_buf_rele(hdl->sa_spill, NULL);
 			hdl->sa_spill = NULL;
 			hdl->sa_spill_tab = NULL;
 			VERIFY(0 == dmu_rm_spill(hdl->sa_os,
 			    sa_handle_object(hdl), tx));
 		} else {
 			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Release the attribute table of `sa', including the name string of
  * every entry that has one.  Does nothing if no table is allocated.
  */
 static void
 sa_free_attr_table(sa_os_t *sa)
 {
 	sa_attr_table_t *table = sa->sa_attr_table;
 	int idx;
 
 	if (table == NULL)
 		return;
 
 	for (idx = 0; idx != sa->sa_num_attrs; idx++) {
 		sa_attr_table_t *ent = &table[idx];
 
 		if (ent->sa_name) {
 			kmem_free(ent->sa_name, strlen(ent->sa_name) + 1);
 		}
 	}
 
 	kmem_free(table, sizeof (sa_attr_table_t) * sa->sa_num_attrs);
 	sa->sa_attr_table = NULL;
 }
 
 /*
  * Build the per-objset attribute tables for `os':
  *
  *  - sa_user_table maps each of the `count' requested attributes in
  *    reg_attrs[] to its attribute number
  *  - sa_attr_table describes every known attribute (length, byteswap
  *    function index, name and whether it is registered on disk)
  *
  * Attribute numbers come from the legacy ZPL table (DMU_OST_ZFS objsets
  * only), from the on-disk registry ZAP when one exists, or are newly
  * allocated past the current count.  sa_need_attr_registration is set
  * when the table ends up with more attributes than the registry holds.
  *
  * Returns 0 on success.  On failure the user table and attribute table
  * are freed and an error is returned.
  */
 static int
 sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 {
 	sa_os_t *sa = os->os_sa;
 	uint64_t sa_attr_count = 0;
 	uint64_t sa_reg_count = 0;
 	int error = 0;
 	uint64_t attr_value;
 	sa_attr_table_t *tb;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int registered_count = 0;
 	int i;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 
 	sa->sa_user_table =
 	    kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
 	sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
 
 	if (sa->sa_reg_attr_obj != 0) {
 		error = zap_count(os, sa->sa_reg_attr_obj,
 		    &sa_attr_count);
 
 		/*
 		 * Make sure we retrieved a count and that it isn't zero
 		 */
 		if (error || (error == 0 && sa_attr_count == 0)) {
 			if (error == 0)
 				error = SET_ERROR(EINVAL);
 			goto bail;
 		}
 		sa_reg_count = sa_attr_count;
 	}
 
 	/* nothing registered yet: reserve room for the legacy ZPL attributes */
 	if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
 		sa_attr_count += sa_legacy_attr_count;
 
 	/* Allocate attribute numbers for attributes that aren't registered */
 	for (i = 0; i != count; i++) {
 		boolean_t found = B_FALSE;
 		int j;
 
 		/* legacy ZPL attributes keep their fixed numbers */
 		if (ostype == DMU_OST_ZFS) {
 			for (j = 0; j != sa_legacy_attr_count; j++) {
 				if (strcmp(reg_attrs[i].sa_name,
 				    sa_legacy_attrs[j].sa_name) == 0) {
 					sa->sa_user_table[i] =
 					    sa_legacy_attrs[j].sa_attr;
 					found = B_TRUE;
 				}
 			}
 		}
 		if (found)
 			continue;
 
 		if (sa->sa_reg_attr_obj)
 			error = zap_lookup(os, sa->sa_reg_attr_obj,
 			    reg_attrs[i].sa_name, 8, 1, &attr_value);
 		else
 			error = SET_ERROR(ENOENT);
 		switch (error) {
 		case ENOENT:
 			/* unknown attribute: hand out the next free number */
 			sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
 			sa_attr_count++;
 			break;
 		case 0:
 			sa->sa_user_table[i] = ATTR_NUM(attr_value);
 			break;
 		default:
 			goto bail;
 		}
 	}
 
 	sa->sa_num_attrs = sa_attr_count;
 	tb = sa->sa_attr_table =
 	    kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
 
 	/*
 	 * Attribute table is constructed from requested attribute list,
 	 * previously foreign registered attributes, and also the legacy
 	 * ZPL set of attributes.
 	 */
 
 	if (sa->sa_reg_attr_obj) {
 		/* each registry entry's first integer encodes number/length/bswap */
 		for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t value;
 			value  = za.za_first_integer;
 
 			registered_count++;
 			tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
 			tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
 			tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
 			tb[ATTR_NUM(value)].sa_registered = B_TRUE;
 
 			if (tb[ATTR_NUM(value)].sa_name) {
 				continue;
 			}
 			tb[ATTR_NUM(value)].sa_name =
 			    kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
 			(void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
 			    strlen(za.za_name) +1);
 		}
 		zap_cursor_fini(&zc);
 		/*
 		 * Make sure we processed the correct number of registered
 		 * attributes
 		 */
 		if (registered_count != sa_reg_count) {
 			ASSERT(error != 0);
 			goto bail;
 		}
 
 	}
 
 	/* fill in legacy ZPL attributes the registry did not provide */
 	if (ostype == DMU_OST_ZFS) {
 		for (i = 0; i != sa_legacy_attr_count; i++) {
 			if (tb[i].sa_name)
 				continue;
 			tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
 			tb[i].sa_length = sa_legacy_attrs[i].sa_length;
 			tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
 			tb[i].sa_registered = B_FALSE;
 			tb[i].sa_name =
 			    kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
 			    KM_SLEEP);
 			(void) strlcpy(tb[i].sa_name,
 			    sa_legacy_attrs[i].sa_name,
 			    strlen(sa_legacy_attrs[i].sa_name) + 1);
 		}
 	}
 
 	/* finally add the requested attributes that are still unnamed */
 	for (i = 0; i != count; i++) {
 		sa_attr_type_t attr_id;
 
 		attr_id = sa->sa_user_table[i];
 		if (tb[attr_id].sa_name)
 			continue;
 
 		tb[attr_id].sa_length = reg_attrs[i].sa_length;
 		tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
 		tb[attr_id].sa_attr = attr_id;
 		tb[attr_id].sa_name =
 		    kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
 		(void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
 		    strlen(reg_attrs[i].sa_name) + 1);
 	}
 
 	sa->sa_need_attr_registration =
 	    (sa_attr_count != registered_count);
 
 	return (0);
 bail:
 	kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
 	sa->sa_user_table = NULL;
 	sa_free_attr_table(sa);
 	return ((error != 0) ? error : EINVAL);
 }
 
 /*
  * Set up the System Attribute state for objset `os' and return the
  * user attribute table (attribute numbers for reg_attrs[], in order)
  * through `user_table'.
  *
  * If the objset already has SA state its existing user table is
  * returned.  Otherwise the state is created: the layout and registry
  * ZAP objects are looked up under the master object `sa_obj' (when
  * non-zero), the attribute tables are built, and every layout stored
  * on disk is loaded into the layout AVL trees.  DMU_OST_ZFS objsets
  * additionally get layout 0 (legacy znode) and the empty layout 1.
  *
  * Returns 0 on success.  On failure all partially built state,
  * including layouts already loaded, is torn down and an error is
  * returned (ECKSUM is reported as EIO).
  */
 int
 sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
     sa_attr_type_t **user_table)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	sa_os_t *sa;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 	sa_attr_type_t *tb;
 	sa_lot_t *layout;
 	void *cookie;
 	int error;
 
 	mutex_enter(&os->os_user_ptr_lock);
 	if (os->os_sa) {
 		/* already set up: hand back the existing table */
 		mutex_enter(&os->os_sa->sa_lock);
 		mutex_exit(&os->os_user_ptr_lock);
 		tb = os->os_sa->sa_user_table;
 		mutex_exit(&os->os_sa->sa_lock);
 		*user_table = tb;
 		return (0);
 	}
 
 	sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
 	mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	sa->sa_master_obj = sa_obj;
 
 	os->os_sa = sa;
 	mutex_enter(&sa->sa_lock);
 	mutex_exit(&os->os_user_ptr_lock);
 	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
 	avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
 
 	if (sa_obj) {
 		/* either ZAP object may legitimately not exist yet */
 		error = zap_lookup(os, sa_obj, SA_LAYOUTS,
 		    8, 1, &sa->sa_layout_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 		error = zap_lookup(os, sa_obj, SA_REGISTRY,
 		    8, 1, &sa->sa_reg_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 	}
 
 	if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
 		goto fail;
 
 	if (sa->sa_layout_attr_obj != 0) {
 		uint64_t layout_count;
 
 		error = zap_count(os, sa->sa_layout_attr_obj,
 		    &layout_count);
 
 		/*
 		 * Layout number count should be > 0
 		 */
 		if (error || (error == 0 && layout_count == 0)) {
 			if (error == 0)
 				error = SET_ERROR(EINVAL);
 			goto fail;
 		}
 
 		/* entry name is the layout number, value the attribute list */
 		for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			sa_attr_type_t *lot_attrs;
 			uint64_t lot_num;
 
 			lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
 			    za.za_num_integers, KM_SLEEP);
 
 			if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
 			    za.za_name, 2, za.za_num_integers,
 			    lot_attrs))) != 0) {
 				kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 				    za.za_num_integers);
 				break;
 			}
 			VERIFY(ddi_strtoull(za.za_name, NULL, 10,
 			    (unsigned long long *)&lot_num) == 0);
 
 			/* the entry keeps its own copy of the attribute list */
 			(void) sa_add_layout_entry(os, lot_attrs,
 			    za.za_num_integers, lot_num,
 			    sa_layout_info_hash(lot_attrs,
 			    za.za_num_integers), B_FALSE, NULL);
 			kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 			    za.za_num_integers);
 		}
 		zap_cursor_fini(&zc);
 
 		/*
 		 * Make sure layout count matches number of entries added
 		 * to AVL tree
 		 */
 		if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
 			ASSERT(error != 0);
 			goto fail;
 		}
 	}
 
 	/* Add special layout number for old ZNODES */
 	if (ostype == DMU_OST_ZFS) {
 		(void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
 		    sa_legacy_attr_count, 0,
 		    sa_layout_info_hash(sa_legacy_zpl_layout,
 		    sa_legacy_attr_count), B_FALSE, NULL);
 
 		(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
 		    0, B_FALSE, NULL);
 	}
 	*user_table = os->os_sa->sa_user_table;
 	mutex_exit(&sa->sa_lock);
 	return (0);
 fail:
 	os->os_sa = NULL;
 	sa_free_attr_table(sa);
 	if (sa->sa_user_table)
 		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 
 	/*
 	 * The layout loop may have failed after some entries were already
 	 * added.  Every entry is linked into both trees, so unlink them
 	 * from the hash tree first and free them while emptying the
 	 * number tree; the trees must be empty before avl_destroy().
 	 */
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree,
 	    &cookie)) != NULL)
 		continue;
 
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree,
 	    &cookie)) != NULL) {
 		kmem_free(layout->lot_attrs,
 		    sizeof (sa_attr_type_t) * layout->lot_attr_count);
 		kmem_free(layout, sizeof (sa_lot_t));
 	}
 
 	mutex_exit(&sa->sa_lock);
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
 	mutex_destroy(&sa->sa_lock);
 	kmem_free(sa, sizeof (sa_os_t));
 	return ((error == ECKSUM) ? EIO : error);
 }
 
 /*
  * Destroy the SA state of objset `os': the user table, the attribute
  * table, every index table hanging off a layout, the layouts themselves
  * and finally the sa_os_t.
  */
 void
 sa_tear_down(objset_t *os)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *layout;
 	void *cookie;
 
 	kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 
 	/* Free up attr table */
 	sa_free_attr_table(sa);
 
 	/* release index tables until each layout's list is empty */
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree,
 	    &cookie)) != NULL) {
 		sa_idx_tab_t *tab;
 
 		while ((tab = list_head(&layout->lot_idx_tab)) != NULL) {
 			ASSERT(refcount_count(&tab->sa_refcount));
 			sa_idx_tab_rele(os, tab);
 		}
 	}
 
 	/* the layout memory itself is freed while emptying the number tree */
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree,
 	    &cookie)) != NULL) {
 		kmem_free(layout->lot_attrs,
 		    sizeof (sa_attr_type_t) * layout->lot_attr_count);
 		kmem_free(layout, sizeof (sa_lot_t));
 	}
 
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
 	mutex_destroy(&sa->sa_lock);
 
 	kmem_free(sa, sizeof (sa_os_t));
 	os->os_sa = NULL;
 }
 
 /*
  * Attribute iterator callback that fills in the index table passed as
  * `userp': records the length of a variable-length attribute and encodes
  * the attribute's length index and byte offset from the header.
  */
 void
 sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t var_length, void *userp)
 {
 	sa_idx_tab_t *idx_tab = userp;
 	uint32_t offset = (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr);
 
 	if (var_length) {
 		ASSERT(idx_tab->sa_variable_lengths);
 		idx_tab->sa_variable_lengths[length_idx] = length;
 	}
 
 	TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, offset);
 }
 
 /*
  * Walk the attributes stored behind `hdr' in layout order and invoke
  * `func' once per attribute.  The layout is `tab' when supplied,
  * otherwise it is looked up by the layout number found in the header.
  * Each attribute starts on an 8-byte boundary.
  */
 static void
 sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
     sa_iterfunc_t func, sa_lot_t *tab, void *userp)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *layout = tab;
 	sa_lot_t key;
 	avl_index_t where;
 	uint16_t *lengths = NULL;
 	uint8_t next_len = 0;
 	void *cursor;
 	int n;
 
 	if (tab == NULL) {
 		key.lot_num = SA_LAYOUT_NUM(hdr, type);
 		layout = avl_find(&sa->sa_layout_num_tree, &key, &where);
 		ASSERT(layout);
 	}
 
 	if (IS_SA_BONUSTYPE(type)) {
 		/* data follows the header and its variable-length array */
 		cursor = (void *)P2ROUNDUP(((uintptr_t)hdr +
 		    offsetof(sa_hdr_phys_t, sa_lengths) +
 		    (sizeof (uint16_t) * layout->lot_var_sizes)), 8);
 		lengths = hdr->sa_lengths;
 	} else {
 		cursor = hdr;
 	}
 
 	for (n = 0; n != layout->lot_attr_count; n++) {
 		int fixed_len, attr_len;
 		uint8_t len_slot;
 
 		fixed_len = sa->sa_attr_table[layout->lot_attrs[n]].sa_length;
 		if (fixed_len == 0) {
 			/* variable length: size comes from the header array */
 			attr_len = lengths[next_len];
 			len_slot = next_len++;
 		} else {
 			attr_len = fixed_len;
 			len_slot = 0;
 		}
 
 		func(hdr, cursor, layout->lot_attrs[n], attr_len,
 		    len_slot, fixed_len == 0 ? B_TRUE : B_FALSE, userp);
 
 		cursor = (void *)P2ROUNDUP(((uintptr_t)cursor +
 		    attr_len), 8);
 	}
 }
 
 /*
  * Attribute iterator callback: byteswap one attribute in place using
  * the byteswap function registered for it.  `userp' is the SA handle.
  */
 /*ARGSUSED*/
 void
 sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t variable_length, void *userp)
 {
 	sa_handle_t *hdl = userp;
 	sa_attr_table_t *ent = &hdl->sa_os->os_sa->sa_attr_table[attr];
 
 	sa_bswap_table[ent->sa_byteswap](attr_addr, length);
 }
 
 /*
  * Byteswap the SA header of the given buffer and all attributes stored
  * in it.  Nothing is done when the header magic already reads as
  * SA_MAGIC.  The caller must hold the objset's sa_lock.
  */
 void
 sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	sa_hdr_phys_t *hdr = SA_GET_HDR(hdl, buftype);
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_buf_impl_t *db;
 	int num_lengths;
 	int n;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	if (hdr->sa_magic == SA_MAGIC)
 		return;
 
 	db = SA_GET_DB(hdl, buftype);
 
 	if (buftype == SA_SPILL) {
 		arc_release(db->db_buf, NULL);
 		arc_buf_thaw(db->db_buf);
 	}
 
 	hdr->sa_magic = BSWAP_32(hdr->sa_magic);
 	hdr->sa_layout_info = BSWAP_16(hdr->sa_layout_info);
 
 	/*
 	 * Determine number of variable lengths in header
 	 * The standard 8 byte header has one for free and a
 	 * 16 byte header would have 4 + 1;
 	 */
 	num_lengths = 1;
 	if (SA_HDR_SIZE(hdr) > 8)
 		num_lengths += (SA_HDR_SIZE(hdr) - 8) >> 1;
 
 	for (n = 0; n != num_lengths; n++)
 		hdr->sa_lengths[n] = BSWAP_16(hdr->sa_lengths[n]);
 
 	sa_attr_iter(hdl->sa_os, hdr, DMU_OT_SA, sa_byteswap_cb, NULL, hdl);
 
 	if (buftype == SA_SPILL)
 		arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
 }
 
 /*
  * Attach an index table to the handle for the bonus or spill buffer,
  * byteswapping the buffer first when its header magic shows it was
  * written with the opposite byte order.  Always returns 0.
  */
 static int
 sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
 	dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
 	sa_hdr_phys_t *hdr = SA_GET_HDR(hdl, buftype);
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_idx_tab_t *tab;
 
 	mutex_enter(&sa->sa_lock);
 
 	/*
 	 * Do we need to byteswap?  Only checked for SA bonus types (not
 	 * old znodes) whose magic is set but does not match.
 	 */
 	if (IS_SA_BONUSTYPE(bonustype) && hdr->sa_magic != SA_MAGIC &&
 	    hdr->sa_magic != 0) {
 		VERIFY(BSWAP_32(hdr->sa_magic) == SA_MAGIC);
 		sa_byteswap(hdl, buftype);
 	}
 
 	tab = sa_find_idx_tab(hdl->sa_os, bonustype, hdr);
 
 	if (buftype != SA_BONUS)
 		hdl->sa_spill_tab = tab;
 	else
 		hdl->sa_bonus_tab = tab;
 
 	mutex_exit(&sa->sa_lock);
 	return (0);
 }
 
 /*
  * Eviction callback installed on the dbuf user of shared SA handles
  * (see the dmu_buf_init_user() call in sa_handle_get_from_db()).
  * It panics unconditionally if it is ever invoked.
  */
 /*ARGSUSED*/
 static void
 sa_evict(void *dbu)
 {
 	panic("evicting sa dbuf\n");
 }
 
 /*
  * Drop one hold on the index table `arg' (a sa_idx_tab_t *, may be
  * NULL).  When the last hold goes away the table is unlinked from its
  * layout and freed.  Takes the objset's sa_lock internally.
  */
 static void
 sa_idx_tab_rele(objset_t *os, void *arg)
 {
 	sa_idx_tab_t *idx_tab = arg;
 	sa_os_t *sa = os->os_sa;
 
 	if (idx_tab == NULL)
 		return;
 
 	mutex_enter(&sa->sa_lock);
 	if (refcount_remove(&idx_tab->sa_refcount, NULL) != 0) {
 		/* still referenced */
 		mutex_exit(&sa->sa_lock);
 		return;
 	}
 
 	list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
 	if (idx_tab->sa_variable_lengths) {
 		kmem_free(idx_tab->sa_variable_lengths,
 		    sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes);
 	}
 	refcount_destroy(&idx_tab->sa_refcount);
 	kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs);
 	kmem_free(idx_tab, sizeof (sa_idx_tab_t));
 
 	mutex_exit(&sa->sa_lock);
 }
 
 /*
  * Take one additional hold on the index table `idx_tab'.  The caller
  * must already hold the objset's sa_lock.
  */
 static void
 sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
 {
 	sa_os_t *sa = os->os_sa;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	(void) refcount_add(&idx_tab->sa_refcount, NULL);
 }
 
 /*
  * Destroy an SA handle: detach it from the bonus dbuf, drop its index
  * table holds, release the bonus (and spill) buffers and return the
  * handle to the cache.
  */
 void
 sa_handle_destroy(sa_handle_t *hdl)
 {
 	dmu_buf_t *bonus = hdl->sa_bonus;
 
 	mutex_enter(&hdl->sa_lock);
 	(void) dmu_buf_remove_user(bonus, &hdl->sa_dbu);
 
 	if (hdl->sa_bonus_tab != NULL) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 	}
 	if (hdl->sa_spill_tab != NULL) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 	}
 
 	dmu_buf_rele(hdl->sa_bonus, NULL);
 	if (hdl->sa_spill != NULL) {
 		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
 	}
 	mutex_exit(&hdl->sa_lock);
 
 	kmem_cache_free(sa_cache, hdl);
 }
 
 /*
  * Return an SA handle for the bonus buffer `db' through `handlepp'.
  *
  * For SA_HDL_SHARED the handle already attached to the dbuf is reused
  * when there is one.  Otherwise a new handle is allocated, its bonus
  * index table is built, and (for shared handles) it is attached to the
  * dbuf.  If another handle was attached in the meantime, the new handle
  * is discarded and the existing one is returned instead.
  *
  * Returns the result of building the bonus index, or 0 when an existing
  * handle was found up front.
  */
 int
 sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	int error = 0;
 	dmu_object_info_t doi;
 	sa_handle_t *handle = NULL;
 
 #ifdef ZFS_DEBUG
 	dmu_object_info_from_db(db, &doi);
 	ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
 	    doi.doi_bonus_type == DMU_OT_ZNODE);
 #endif
 	/* find handle, if it exists */
 	/* if one doesn't exist then create a new one, and initialize it */
 
 	if (hdl_type == SA_HDL_SHARED)
 		handle = dmu_buf_get_user(db);
 
 	if (handle == NULL) {
 		sa_handle_t *winner = NULL;
 
 		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
 		handle->sa_dbu.dbu_evict_func = NULL;
 		handle->sa_userp = userp;
 		handle->sa_bonus = db;
 		handle->sa_os = os;
 		handle->sa_spill = NULL;
 		handle->sa_bonus_tab = NULL;
 		handle->sa_spill_tab = NULL;
 
 		error = sa_build_index(handle, SA_BONUS);
 
 		if (hdl_type == SA_HDL_SHARED) {
 			dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL);
 			winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
 		}
 
 		if (winner != NULL) {
 			/*
 			 * Another handle is already attached.  Drop the
 			 * index table hold sa_build_index() took for our
 			 * handle before discarding it, so the hold is not
 			 * leaked.
 			 */
 			sa_idx_tab_rele(os, handle->sa_bonus_tab);
 			handle->sa_bonus_tab = NULL;
 			kmem_cache_free(sa_cache, handle);
 			handle = winner;
 		}
 	}
 	*handlepp = handle;
 
 	return (error);
 }
 
 /*
  * Hold the bonus buffer of object `objid' and return an SA handle for
  * it through `handlepp'.  Fails with the dmu_bonus_hold() error when
  * the bonus buffer cannot be held.
  */
 int
 sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	dmu_buf_t *db;
 	int error;
 
 	error = dmu_bonus_hold(objset, objid, NULL, &db);
 	if (error != 0)
 		return (error);
 
 	return (sa_handle_get_from_db(objset, db, userp, hdl_type,
 	    handlepp));
 }
 
 /*
  * Hold the bonus buffer of object `obj_num' on behalf of `tag'; a thin
  * wrapper that passes its arguments straight to dmu_bonus_hold() and
  * returns its result.
  */
 int
 sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
 {
 	return (dmu_bonus_hold(objset, obj_num, tag, db));
 }
 
 /*
  * Release a buffer hold for `tag'; a thin wrapper around
  * dmu_buf_rele().
  */
 void
 sa_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_rele(db, tag);
 }
 
 /*
  * Look up the `count' attributes described by bulk[] on the handle.
  * The caller must already hold the handle's sa_lock.  Returns the
  * result of sa_attr_op().
  */
 int
 sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
 {
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 	return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
 }
 
 /*
  * Look up a single attribute, copying its value into `buf' (of size
  * `buflen').  Takes and drops the handle's sa_lock around the lookup.
  */
 int
 sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
 {
 	sa_bulk_attr_t bulk;
 	int error;
 
 	ASSERT(hdl);
 
 	bulk.sa_attr = attr;
 	bulk.sa_data = buf;
 	bulk.sa_length = buflen;
 	bulk.sa_data_func = NULL;
 
 	mutex_enter(&hdl->sa_lock);
 	error = sa_lookup_impl(hdl, &bulk, 1);
 	mutex_exit(&hdl->sa_lock);
 
 	return (error);
 }
 
 #ifdef _KERNEL
 /*
  * Look up a single attribute and move its value into `uio', limited to
  * the smaller of the attribute size and the space left in the uio.
  */
 int
 sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
 {
 	sa_bulk_attr_t bulk;
 	int error;
 
 	bulk.sa_attr = attr;
 	bulk.sa_data = NULL;
 	bulk.sa_data_func = NULL;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL);
 	if (error == 0) {
 		error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
 		    uio->uio_resid), UIO_READ, uio);
 	}
 	mutex_exit(&hdl->sa_lock);
 
 	return (error);
 }
 #endif
 
 /*
  * Return a held index table (sa_idx_tab_t *) for the SA data at `data'.
  * An existing table of the matching layout is reused when its recorded
  * variable lengths agree with the header; otherwise a new table is
  * built, linked onto the layout and returned.
  *
  * NOTE(review): sa_idx_tab_hold() asserts sa_lock is held, so callers
  * are expected to hold it - confirm for callers outside this view.
  */
 void *
 sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
 {
 	sa_idx_tab_t *idx_tab;
 	sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, search;
 	avl_index_t loc;
 
 	/*
 	 * Determine layout number.  If SA node and header == 0 then
 	 * force the index table to the dummy "1" empty layout.
 	 *
 	 * The layout number would only be zero for a newly created file
 	 * that has not added any attributes yet, or with crypto enabled which
 	 * doesn't write any attributes to the bonus buffer.
 	 */
 
 	search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
 
 	tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
 
 	/* Verify header size is consistent with layout information */
 	ASSERT(tb);
 	ASSERT(IS_SA_BONUSTYPE(bonustype) &&
 	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
 	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
 
 	/*
 	 * See if any of the already existing TOC entries can be reused?
 	 */
 
 	for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
 	    idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
 		boolean_t valid_idx = B_TRUE;
 		int i;
 
 		/* a table is reusable only if every variable length matches */
 		if (tb->lot_var_sizes != 0 &&
 		    idx_tab->sa_variable_lengths != NULL) {
 			for (i = 0; i != tb->lot_var_sizes; i++) {
 				if (hdr->sa_lengths[i] !=
 				    idx_tab->sa_variable_lengths[i]) {
 					valid_idx = B_FALSE;
 					break;
 				}
 			}
 		}
 		if (valid_idx) {
 			sa_idx_tab_hold(os, idx_tab);
 			return (idx_tab);
 		}
 	}
 
 	/* No such luck, create a new entry */
 	idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
 	idx_tab->sa_idx_tab =
 	    kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
 	idx_tab->sa_layout = tb;
 	refcount_create(&idx_tab->sa_refcount);
 	if (tb->lot_var_sizes)
 		idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
 		    tb->lot_var_sizes, KM_SLEEP);
 
 	/* fill in the offset (and variable length) of every attribute */
 	sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
 	    tb, idx_tab);
 	sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
 	sa_idx_tab_hold(os, idx_tab);	/* one for layout */
 	list_insert_tail(&tb->lot_idx_tab, idx_tab);
 	return (idx_tab);
 }
 
 /*
  * Data locator that reports the whole attribute as one contiguous
  * chunk: `userdata' is returned as the data pointer and `total_len' as
  * its length.  It asserts that it is only called with `start' set.
  */
 void
 sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
     boolean_t start, void *userdata)
 {
 	ASSERT(start);
 
 	*dataptr = userdata;
 	*len = total_len;
 }
 
 /*
  * Write every attribute that is not yet registered into the registry
  * ZAP object, creating and linking that object under the master object
  * first if needed.  Nothing is done when no registration is pending or
  * the objset has no SA master object.
  */
 static void
 sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_attr_table_t *tb = sa->sa_attr_table;
 	uint64_t attr_value = 0;
 	int idx;
 
 	mutex_enter(&sa->sa_lock);
 
 	if (sa->sa_need_attr_registration && sa->sa_master_obj != 0) {
 		if (sa->sa_reg_attr_obj == 0) {
 			sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
 			    DMU_OT_SA_ATTR_REGISTRATION,
 			    sa->sa_master_obj, SA_REGISTRY, tx);
 		}
 
 		for (idx = 0; idx != sa->sa_num_attrs; idx++) {
 			if (tb[idx].sa_registered)
 				continue;
 
 			/* value packs number, length and byteswap index */
 			ATTR_ENCODE(attr_value, tb[idx].sa_attr,
 			    tb[idx].sa_length, tb[idx].sa_byteswap);
 			VERIFY(0 == zap_update(hdl->sa_os,
 			    sa->sa_reg_attr_obj, tb[idx].sa_name, 8, 1,
 			    &attr_value, tx));
 			tb[idx].sa_registered = B_TRUE;
 		}
 		sa->sa_need_attr_registration = B_FALSE;
 	}
 
 	mutex_exit(&sa->sa_lock);
 }
 
 /*
  * Replace all attributes with the attributes specified in the template.
  * If the dnode had a spill buffer then those attributes will also be
  * replaced, possibly with just an empty spill block.
  *
  * This interface is intended to only be used for bulk adding of
  * attributes for a new file.  It will also be used by the ZPL
  * when converting an old formatted znode to native SA support.
  *
  * Pending attribute registration is synced out before the layouts are
  * rebuilt.
  */
 int
 sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	if (hdl->sa_os->os_sa->sa_need_attr_registration) {
 		sa_attr_register_sync(hdl, tx);
 	}
 
 	return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
 }
 
 /*
  * Locking wrapper around sa_replace_all_by_template_locked(): performs
  * the replacement with the handle's sa_lock held.
  */
 int
 sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	int ret;
 
 	mutex_enter(&hdl->sa_lock);
 	ret = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count,
 	    tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (ret);
 }
 
 /*
  * Add/remove a single attribute or replace a variable-sized attribute value
  * with a value of a different size, and then rewrite the entire set
  * of attributes.
  * Same-length attribute value replacement (including fixed-length attributes)
  * is handled more efficiently by the upper layers.
  *
  * The caller must hold hdl->sa_lock.  The current bonus (and, if present,
  * spill) contents are copied aside, a descriptor array covering every
  * resulting attribute is built, and sa_build_layouts() is called on it.
  *
  * Returns the sa_get_spill() error if it fails with anything other than
  * ENOENT, otherwise the result of sa_build_layouts().
  *
  * NOTE(review): for SA_REMOVE, attr_count is decremented whether or not
  * newattr is found in the existing layouts; this presumably relies on the
  * caller only removing attributes that exist -- confirm.
  */
 static int
 sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 	dnode_t *dn;
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];	/* [0] = bonus copy, [1] = spill copy */
 	int bonus_attr_count = 0;
 	int bonus_data_size = 0;
 	int spill_data_size = 0;
 	int spill_attr_count = 0;
 	int error;
 	uint16_t length, reg_length;
 	int i, j, k, length_idx;
 	sa_hdr_phys_t *hdr;
 	sa_idx_tab_t *idx_tab;
 	int attr_count;
 	int count;
 
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	/* First make a copy of the old data */
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn->dn_bonuslen != 0) {
 		bonus_data_size = hdl->sa_bonus->db_size;
 		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
 		bcopy(hdl->sa_bonus->db_data, old_data[0],
 		    hdl->sa_bonus->db_size);
 		bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
 	} else {
 		old_data[0] = NULL;
 	}
 	DB_DNODE_EXIT(db);
 
 	/* Bring spill buffer online if it isn't currently */
 
 	if ((error = sa_get_spill(hdl)) == 0) {
 		spill_data_size = hdl->sa_spill->db_size;
 		old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
 		bcopy(hdl->sa_spill->db_data, old_data[1],
 		    hdl->sa_spill->db_size);
 		spill_attr_count =
 		    hdl->sa_spill_tab->sa_layout->lot_attr_count;
 	} else if (error && error != ENOENT) {
 		/* Real failure: release the bonus copy and bail out. */
 		if (old_data[0])
 			kmem_free(old_data[0], bonus_data_size);
 		return (error);
 	} else {
 		/* ENOENT is treated as "no spill data to copy". */
 		old_data[1] = NULL;
 	}
 
 	/* build descriptor of all attributes */
 
 	attr_count = bonus_attr_count + spill_attr_count;
 	if (action == SA_ADD)
 		attr_count++;
 	else if (action == SA_REMOVE)
 		attr_count--;
 
 	attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
 
 	/*
 	 * loop through bonus and spill buffer if it exists, and
 	 * build up new attr_descriptor to reset the attributes
 	 */
 	k = j = 0;
 	count = bonus_attr_count;
 	hdr = SA_GET_HDR(hdl, SA_BONUS);
 	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
 	for (; k != 2; k++) {
 		/*
 		 * Iterate over each attribute in layout.  Fetch the
 		 * size of variable-length attributes needing rewrite
 		 * from sa_lengths[].
 		 */
 		for (i = 0, length_idx = 0; i != count; i++) {
 			sa_attr_type_t attr;
 
 			attr = idx_tab->sa_layout->lot_attrs[i];
 			reg_length = SA_REGISTERED_LEN(sa, attr);
 			/* A registered length of 0 marks a variable-sized attr. */
 			if (reg_length == 0) {
 				length = hdr->sa_lengths[length_idx];
 				length_idx++;
 			} else {
 				length = reg_length;
 			}
 			if (attr == newattr) {
 				/*
 				 * There is nothing to do for SA_REMOVE,
 				 * so it is just skipped.
 				 */
 				if (action == SA_REMOVE)
 					continue;
 
 				/*
 				 * Duplicate attributes are not allowed, so the
 				 * action can not be SA_ADD here.
 				 */
 				ASSERT3S(action, ==, SA_REPLACE);
 
 				/*
 				 * Only a variable-sized attribute can be
 				 * replaced here, and its size must be changing.
 				 */
 				ASSERT3U(reg_length, ==, 0);
 				ASSERT3U(length, !=, buflen);
 				SA_ADD_BULK_ATTR(attr_desc, j, attr,
 				    locator, datastart, buflen);
 			} else {
 				/*
 				 * Unchanged attribute: point the descriptor at
 				 * its bytes inside the saved copy of buffer k.
 				 */
 				SA_ADD_BULK_ATTR(attr_desc, j, attr,
 				    NULL, (void *)
 				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
 				    (uintptr_t)old_data[k]), length);
 			}
 		}
 		/* After the bonus pass, switch to the spill buffer if any. */
 		if (k == 0 && hdl->sa_spill) {
 			hdr = SA_GET_HDR(hdl, SA_SPILL);
 			idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
 			count = spill_attr_count;
 		} else {
 			break;
 		}
 	}
 	if (action == SA_ADD) {
 		/* The new attribute goes after all existing ones. */
 		reg_length = SA_REGISTERED_LEN(sa, newattr);
 		IMPLY(reg_length != 0, reg_length == buflen);
 		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
 		    datastart, buflen);
 	}
 	ASSERT3U(j, ==, attr_count);
 
 	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
 
 	/* Release the saved copies and the descriptor array. */
 	if (old_data[0])
 		kmem_free(old_data[0], bonus_data_size);
 	if (old_data[1])
 		kmem_free(old_data[1], spill_data_size);
 	kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
 
 	return (error);
 }
 
 /*
  * Apply the `count' attribute updates described by `bulk' to the handle.
  *
  * The caller must hold hdl->sa_lock.  A pending attribute registration is
  * synced out first.  On success, when the handle's bonus buffer is not of
  * an SA bonus type and the objset has an update callback registered, that
  * callback is invoked.  Returns the result of sa_attr_op().
  */
 static int
 sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     dmu_tx_t *tx)
 {
 	int error;
 	sa_os_t *sa;
 	dmu_object_type_t bonustype;
 
 	/* Check the handle before anything below dereferences it. */
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	sa = hdl->sa_os->os_sa;
 	bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
 
 	/* sync out registration table if necessary */
 	if (sa->sa_need_attr_registration)
 		sa_attr_register_sync(hdl, tx);
 
 	error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
 	if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
 		sa->sa_update_cb(hdl, tx);
 
 	return (error);
 }
 
 /*
  * Update (or add) one attribute of the given type from a caller-supplied
  * buffer of `buflen' bytes.  hdl->sa_lock is taken around the update and
  * the result of sa_bulk_update_impl() is returned.
  */
 int
 sa_update(sa_handle_t *hdl, sa_attr_type_t type,
     void *buf, uint32_t buflen, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t desc;
 	int rc;
 
 	/* Describe the single attribute; no locator callback is used. */
 	desc.sa_attr = type;
 	desc.sa_data = buf;
 	desc.sa_length = buflen;
 	desc.sa_data_func = NULL;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, &desc, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Update one attribute using a bulk entry whose sa_data_func is `locator'
  * and whose sa_data is `userdata'.  hdl->sa_lock is taken around the
  * update and the result of sa_bulk_update_impl() is returned.
  */
 int
 sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
     uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t desc;
 	int rc;
 
 	desc.sa_attr = attr;
 	desc.sa_length = buflen;
 	desc.sa_data_func = locator;
 	desc.sa_data = userdata;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, &desc, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Look up attribute `attr' and report its size through *size.
  *
  * Returns 0 on success.  If the lookup fails its error is returned and
  * *size is left untouched.
  */
 int
 sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
 {
 	sa_bulk_attr_t desc;
 	int rc;
 
 	/* Lookup with no data buffer and no locator callback. */
 	desc.sa_attr = attr;
 	desc.sa_data = NULL;
 	desc.sa_data_func = NULL;
 
 	ASSERT(hdl);
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_attr_op(hdl, &desc, 1, SA_LOOKUP, NULL);
 	if (rc == 0)
 		*size = desc.sa_size;
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Look up `count' attributes described by `attrs'.  The caller must
  * already hold hdl->sa_lock.  Returns the result of sa_lookup_impl().
  */
 int
 sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int rc;
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	rc = sa_lookup_impl(hdl, attrs, count);
 	return (rc);
 }
 
 /*
  * Look up `count' attributes described by `attrs', taking hdl->sa_lock
  * around the call to sa_bulk_lookup_locked() and returning its result.
  */
 int
 sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int rc;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_lookup_locked(hdl, attrs, count);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Update `count' attributes described by `attrs', taking hdl->sa_lock
  * around the call to sa_bulk_update_impl() and returning its result.
  */
 int
 sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
 {
 	int rc;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, attrs, count, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Remove attribute `attr' from the handle by calling sa_modify_attrs()
  * with SA_REMOVE while holding hdl->sa_lock.  Returns its result.
  */
 int
 sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
 {
 	int rc;
 
 	mutex_enter(&hdl->sa_lock);
 	/* No locator, data pointer or length is needed for a removal. */
 	rc = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Fill in *doi by passing the handle's bonus buffer to
  * dmu_object_info_from_db().
  */
 void
 sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_info_from_db(bonus, doi);
 }
 
 /*
  * Report block size and block count through *blksize and *nblocks by
  * passing the handle's bonus buffer to dmu_object_size_from_db().
  */
 void
 sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_size_from_db(bonus, blksize, nblocks);
 }
 
 /*
  * Store an opaque caller-owned pointer in the handle; it is returned
  * unchanged by sa_get_userdata().
  */
 void
 sa_set_userp(sa_handle_t *hdl, void *ptr)
 {
 	void **slot = &hdl->sa_userp;
 
 	*slot = ptr;
 }
 
 /*
  * Return the handle's bonus buffer as a dmu_buf_t pointer.
  */
 dmu_buf_t *
 sa_get_db(sa_handle_t *hdl)
 {
 	dmu_buf_t *bonus = (dmu_buf_t *)hdl->sa_bonus;
 
 	return (bonus);
 }
 
 /*
  * Return the opaque pointer previously stored with sa_set_userp().
  */
 void *
 sa_get_userdata(sa_handle_t *hdl)
 {
 	void *userp = hdl->sa_userp;
 
 	return (userp);
 }
 
 /*
  * Install `func' as the objset's SA update callback.  The caller must
  * already hold os->os_sa->sa_lock.
  */
 void
 sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
 {
 	sa_os_t *sa = os->os_sa;
 
 	ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
 	sa->sa_update_cb = func;
 }
 
 /*
  * Install `func' as the objset's SA update callback, taking
  * os->os_sa->sa_lock around the assignment.
  */
 void
 sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
 {
 	kmutex_t *lock = &os->os_sa->sa_lock;
 
 	mutex_enter(lock);
 	sa_register_update_callback_locked(os, func);
 	mutex_exit(lock);
 }
 
 /*
  * Return the object number recorded in the handle's bonus buffer.
  */
 uint64_t
 sa_handle_object(sa_handle_t *hdl)
 {
 	uint64_t object = hdl->sa_bonus->db_object;
 
 	return (object);
 }
 
 /*
  * Report whether system-attribute state is attached to this objset,
  * i.e. whether os->os_sa is non-NULL.
  */
 boolean_t
 sa_enabled(objset_t *os)
 {
 	return (os->os_sa != NULL);
 }
 
 /*
  * Record `sa_object' as the objset's SA master object.
  *
  * Returns 1 without changing anything if a master object is already set,
  * otherwise stores the new value and returns 0.
  */
 int
 sa_set_sa_object(objset_t *os, uint64_t sa_object)
 {
 	sa_os_t *sa_state = os->os_sa;
 
 	if (sa_state->sa_master_obj != 0)
 		return (1);
 
 	sa_state->sa_master_obj = sa_object;
 	return (0);
 }
 
 /*
  * Return SA_HDR_SIZE() of the SA header that `arg' points to.
  */
 int
 sa_hdrsize(void *arg)
 {
 	sa_hdr_phys_t *sa_hdr = arg;
 	int size = SA_HDR_SIZE(sa_hdr);
 
 	return (size);
 }
 
 /*
  * Acquire the handle's sa_lock on behalf of the caller; paired with
  * sa_handle_unlock().
  */
 void
 sa_handle_lock(sa_handle_t *hdl)
 {
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 }
 
 /*
  * Release the handle's sa_lock previously taken with sa_handle_lock().
  */
 void
 sa_handle_unlock(sa_handle_t *hdl)
 {
 	ASSERT(hdl);
 
 	mutex_exit(&hdl->sa_lock);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 296519)
@@ -1,7055 +1,7056 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 #include <sys/zvol.h>
 #include <sys/trim_map.h>
 
 #ifdef	_KERNEL
 #include <sys/callb.h>
 #include <sys/cpupart.h>
 #include <sys/zone.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /* Check hostid on import? */
 static int check_hostid = 1;
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 static int zfs_ccw_retry_interval = 300;
 
 /*
  * Export both knobs above under the vfs.zfs sysctl node
  * (vfs.zfs.check_hostid and vfs.zfs.ccw_retry_interval).
  */
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
     "Check hostid on import?");
 TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
     &zfs_ccw_retry_interval, 0,
     "Configuration cache file write, retry after failure, interval (seconds)");
 
 /*
  * How a zio taskq entry is interpreted; spa_taskqs_init() switches on
  * this mode when creating the taskqs.
  */
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 /*
  * Initializers for zio_taskq_info_t, in field order
  * { zti_mode, zti_value, zti_count }.
  */
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 /* Shorthands for a single taskq with n threads, and with one thread. */
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 /* One cell of the zio_taskqs[][] table. */
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;	/* how zti_value is interpreted */
 	uint_t zti_value;	/* # of threads for ZTI_MODE_FIXED */
 	uint_t zti_count;	/* number of taskqs to create */
 } zio_taskq_info_t;
 
 /*
  * Name suffix for each taskq type; spa_taskqs_init() combines it with
  * the zio type name to build the taskq's name.
  */
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_BATCH
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  * need to be handled with minimum delay.
  */
 /* Indexed as zio_taskqs[zio type][taskq type]; one row per zio type. */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
 	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
 	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 /*
  * zio_taskq_batch_pct is the value handed to taskq creation, together
  * with TASKQ_THREADS_CPU_PCT, for ZTI_MODE_BATCH taskqs (see
  * spa_taskqs_init()).
  */
 uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
 #ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
 #endif
 #ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 #endif
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
 #ifndef illumos
 extern void spa_deadman(void *arg);
 #endif
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Add one property to `nvl' as a nested nvlist holding its source and
  * value.  A non-NULL `strval' is stored as a string value; otherwise
  * `intval' is stored as a uint64 value.  Any nvlist failure is fatal.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *entry;
 	const char *name = zpool_prop_to_name(prop);
 
 	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(entry, ZPROP_SOURCE, src) == 0);
 
 	if (strval == NULL) {
 		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, intval) == 0);
 	} else {
 		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, strval) == 0);
 	}
 
 	/* The temporary list is freed after being added to nvl. */
 	VERIFY(nvlist_add_nvlist(nvl, name, entry) == 0);
 	nvlist_free(entry);
 }
 
 /*
  * Get property values from the spa configuration.
  *
  * Adds to *nvp the properties that are derived from in-core state rather
  * than read from the MOS property object.  The caller must hold
  * spa->spa_props_lock.
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size, alloc, cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	/* Properties that need the root vdev / normal metaslab class. */
 	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == FREAD), src);
 
 		/* Capacity is a percentage; guard against a zero-sized pool. */
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 			src = ZPROP_SRC_DEFAULT;
 		else
 			src = ZPROP_SRC_LOCAL;
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
 	/*
 	 * From here on `src' keeps whatever was chosen for the version
 	 * property above (or ZPROP_SRC_NONE if rvd was NULL).
 	 */
 	if (pool != NULL) {
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		/* Same fallback to 0 when there is no leak directory. */
 		if (pool->dp_leak_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	/* Largest block size depends on the large_blocks feature. */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 	}
 
 	/*
 	 * Report the cachefile only when the head of the config list is
 	 * "none" (NULL path) or differs from the default spa_config_path.
 	 */
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Allocates *nvp and fills it with the config-derived properties plus
  * every recognized entry of the MOS pool property object.
  *
  * Returns 0 on success.  If the final error is anything other than
  * ENOENT, *nvp is freed, set to NULL and that error is returned.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nvp);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0) {
 		mutex_exit(&spa->spa_props_lock);
 		return (0);
 	}
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Skip ZAP entries that are not known pool properties. */
 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 			continue;
 
 		switch (za.za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za.za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 
 				/*
 				 * bootfs is stored as a dataset object number
 				 * but reported as the dataset's name.
 				 */
 				dp = spa_get_dsl(spa);
 				dsl_pool_config_enter(dp, FTAG);
 				/*
 				 * NOTE(review): this break leaves the switch,
 				 * not the for loop, so err is overwritten by
 				 * the next zap_cursor_retrieve() -- confirm
 				 * that is intended.
 				 */
 				if (err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds)) {
 					dsl_pool_config_exit(dp, FTAG);
 					break;
 				}
 
 				strval = kmem_alloc(
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 				dsl_pool_config_exit(dp, FTAG);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
 			}
 
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			/* Free with the same size used for the allocation. */
 			if (strval != NULL)
 				kmem_free(strval,
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za.za_name, 1, za.za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za.za_num_integers);
 				break;
 			}
 			spa_prop_add_list(*nvp, prop, strval, 0, src);
 			kmem_free(strval, za.za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 	mutex_exit(&spa->spa_props_lock);
 	/* NOTE(review): no goto in this function targets this label. */
 out:
 	if (err && err != ENOENT) {
 		nvlist_free(*nvp);
 		*nvp = NULL;
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Each pair is checked according to its property; validation stops at
  * the first error, which is returned.  If a bootfs string was accepted,
  * the string entry in `props' is replaced by a uint64 entry holding the
  * resolved object number.  Returns 0 if everything validated.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPROP_INVAL:
 			/* Not a named pool property: must be a feature. */
 			if (!zpool_prop_feature(propname)) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/*
 			 * Sanitize the input.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/* Only a value of 0 is accepted for a feature. */
 			if (intval != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/* The feature name is the part after the '@'. */
 			fname = strchr(propname, '@') + 1;
 			if (zfeature_lookup_name(fname, NULL) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			has_feature = B_TRUE;
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * Reject a downgrade, a version beyond
 			 * SPA_VERSION_BEFORE_FEATURES, or a version given
 			 * after a feature was seen in the same list.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 			/* These accept only 0 or 1. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 				uint64_t propval;
 
 				/* An empty value resets bootfs to its default. */
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				if (error = dmu_objset_hold(strval, FTAG, &os))
 					break;
 
 				/*
 				 * Must be ZPL, and its property settings
 				 * must be supported by GRUB (compression
 				 * is not gzip, and large blocks are not used).
 				 */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    &propval)) == 0 &&
 				    !BOOTFS_COMPRESS_VALID(propval)) {
 					error = SET_ERROR(ENOTSUP);
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    &propval)) == 0 &&
 				    propval > SPA_OLD_MAXBLOCKSIZE) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 			    intval > ZIO_FAILURE_MODE_PANIC))
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" are accepted as-is. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is not empty, "." or "..". */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			for (check = strval; *check != '\0'; check++) {
 				/*
 				 * The kernel doesn't have an easy isprint()
 				 * check.  For this kernel check, we merely
 				 * check ASCII apart from DEL.  Fix this if
 				 * there is an easy-to-use kernel isprint().
 				 */
 				if (*check >= 0x7f) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			/* The length check runs even if the scan failed. */
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = E2BIG;
 			break;
 
 		case ZPOOL_PROP_DEDUPDITTO:
 			if (spa_version(spa) < SPA_VERSION_DEDUP)
 				error = SET_ERROR(ENOTSUP);
 			else
 				error = nvpair_value_uint64(elem, &intval);
 			/* 0 is allowed; otherwise at least ZIO_DEDUPDITTO_MIN. */
 			if (error == 0 &&
 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 				error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* Stop at the first property that fails validation. */
 		if (error)
 			break;
 	}
 
 	/*
 	 * Swap the bootfs string entry for a uint64 entry holding the
 	 * object number resolved above.
 	 */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * If `nvp' carries a cachefile property, push a new entry for it onto
  * the head of spa->spa_config_list: "" selects the default
  * spa_config_path, "none" stores a NULL path, anything else is copied.
  * With `need_sync' set, an async config update is requested afterwards.
  * Does nothing when the property is absent.
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	spa_config_dirent_t *dirent;
 	char *path;
 	int rc;
 
 	rc = nvlist_lookup_string(nvp,
 	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &path);
 	if (rc != 0)
 		return;
 
 	dirent = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 
 	if (path[0] == '\0') {
 		dirent->scd_path = spa_strdup(spa_config_path);
 	} else if (strcmp(path, "none") != 0) {
 		dirent->scd_path = spa_strdup(path);
 	} else {
 		dirent->scd_path = NULL;
 	}
 
 	list_insert_head(&spa->spa_config_list, dirent);
 
 	if (need_sync)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Validate and apply the pool properties in `nvp'.
  *
  * Version changes (explicit, or implied by a feature property) are
  * applied through their own spa_sync_version sync task.  If any property
  * needs to be written out, a single spa_sync_props sync task is then
  * issued for the whole list.  Returns the first error encountered, or 0.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
 	nvpair_t *elem = NULL;
 	boolean_t need_sync = B_FALSE;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 
 		/* These properties alone do not trigger a sync task here. */
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
 			uint64_t ver;
 
 			if (prop == ZPOOL_PROP_VERSION) {
 				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 			} else {
 				/* A feature implies SPA_VERSION_FEATURES. */
 				ASSERT(zpool_prop_feature(nvpair_name(elem)));
 				ver = SPA_VERSION_FEATURES;
 				need_sync = B_TRUE;
 			}
 
 			/* Save time if the version is already set. */
 			if (ver == spa_version(spa))
 				continue;
 
 			/*
 			 * In addition to the pool directory object, we might
 			 * create the pool properties object, the features for
 			 * read object, the features for write object, or the
 			 * feature descriptions object.
 			 */
 			error = dsl_sync_task(spa->spa_name, NULL,
 			    spa_sync_version, &ver,
 			    6, ZFS_SPACE_CHECK_RESERVED);
 			if (error)
 				return (error);
 			continue;
 		}
 
 		/* Any other property needs syncing; no need to look further. */
 		need_sync = B_TRUE;
 		break;
 	}
 
 	if (need_sync) {
 		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (0);
 }
 
 /*
  * Clear the pool's bootfs property if it currently refers to `dsobj'.
  * Nothing happens when bootfs names a different object or the pool has
  * no property object.  Failure to remove the ZAP entry is fatal.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	VERIFY(zap_remove(spa->spa_meta_objset,
 	    spa->spa_pool_props_object,
 	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Check function for the guid-change sync task: the root vdev must be
  * in the HEALTHY state, otherwise ENXIO is returned.  `arg' points to
  * the proposed new guid.
  */
 /*ARGSUSED*/
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t *newguid = arg;
 	uint64_t state;
 
 	/* Sample the root vdev state under the state config lock. */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	state = root->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (state != VDEV_STATE_HEALTHY)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3U(spa_guid(spa), !=, *newguid);
 
 	return (0);
 }
 
 /*
  * Sync function for the guid-change sync task: installs *arg as the
  * root vdev's guid, adjusts the guid sum by the difference, marks the
  * root vdev's config dirty and logs the change to the pool history.
  */
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t *newguid = arg;
 	uint64_t prev;
 
 	/* Capture the current guid before it is overwritten below. */
 	prev = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root->vdev_guid = *newguid;
 	root->vdev_guid_sum += (*newguid - prev);
 	vdev_config_dirty(root);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    prev, *newguid);
 }
 
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
  * the root vdev's guid, our own pool guid, and then mark all of our
  * vdevs dirty.  Note that we must make sure that all our vdevs are
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
  *
  * Runs with spa_vdev_top_lock and spa_namespace_lock held.  Returns the
  * result of the sync task.
  */
 int
 spa_change_guid(spa_t *spa)
 {
 	uint64_t newguid;
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	newguid = spa_generate_guid(NULL);
 	rc = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &newguid, 5, ZFS_SPACE_CHECK_RESERVED);
 
 	/* Only publish the new config and notify watchers on success. */
 	if (rc == 0) {
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_error_entry_t nodes: orders entries by the raw
  * bytes of their bookmarks and returns -1, 0 or 1.
  *
  * memcmp() is used rather than bcmp() because only memcmp() guarantees
  * a signed result that reflects byte order; bcmp() merely distinguishes
  * equal from unequal, which cannot give the tree a consistent ordering.
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *sa = a;
 	const spa_error_entry_t *sb = b;
 	int ret;
 
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	/* Normalize to the -1/0/1 range. */
 	if (ret < 0)
 		return (-1);
 	else if (ret > 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Hand the caller copies of the pool's "last" and "scrub" error trees
  * and reset both in-core trees to empty.  The caller must hold
  * spa->spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	const size_t node_size = sizeof (spa_error_entry_t);
 	const size_t node_off = offsetof(spa_error_entry_t, se_avl);
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Copy out the tree heads first ... */
 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 
 	/* ... then re-create both in-core trees. */
 	avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare,
 	    node_size, node_off);
 	avl_create(&spa->spa_errlist_last, spa_error_entry_compare,
 	    node_size, node_off);
 }
 
 /*
  * Create the taskqs for zio type `t' and taskq type `q' as described by
  * zio_taskqs[t][q], and record them in spa->spa_zio_taskq[t][q].
  *
  * For ZTI_MODE_NULL no taskq is created and the slot is left empty.
  * An unrecognized mode panics.
  */
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	char name[32];
 	uint_t flags = 0;
 	boolean_t batch = B_FALSE;
 
 	if (mode == ZTI_MODE_NULL) {
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 	}
 
 	ASSERT3U(count, >, 0);
 
 	/* One taskq pointer per requested taskq. */
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		/* Never go below one thread, even without assertions. */
 		ASSERT3U(value, >=, 1);
 		value = MAX(value, 1);
 		break;
 
 	case ZTI_MODE_BATCH:
 		/* The table value is replaced by zio_taskq_batch_pct. */
 		batch = B_TRUE;
 		flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 		break;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 
 		/* Append an index to the name only when there are several. */
 		if (count > 1) {
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		} else {
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 		}
 
 #ifdef SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			if (batch)
 				flags |= TASKQ_DC_BATCH;
 
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly lower priority
 			 * than the other taskqs.
 			 */
 			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 				pri--;
 
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 /*
  * Destroy every taskq held in one (zio type, taskq type) slot and release
  * the array that stored their handles.
  */
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t idx = 0;
 
 	/* Nothing to destroy for this slot. */
 	if (tqs->stqs_taskq == NULL) {
 		ASSERT0(tqs->stqs_count);
 		return;
 	}
 
 	while (idx < tqs->stqs_count) {
 		ASSERT3P(tqs->stqs_taskq[idx], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[idx]);
 		idx++;
 	}
 
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 }
 
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * A type may be backed by several discrete taskqs to avoid lock contention
  * on the taskq itself.  In that case one of them is picked pseudo-randomly
  * from a time source taken modulo the number of taskqs: cpu_ticks() in the
  * kernel, gethrtime() otherwise.
  */
 void
 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	taskq_t *target;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	if (tqs->stqs_count != 1) {
 #ifdef _KERNEL
 		target = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
 #else
 		target = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
 #endif
 	} else {
 		/* Single taskq: no selection needed. */
 		target = tqs->stqs_taskq[0];
 	}
 
 	taskq_dispatch_ent(target, func, arg, flags, ent);
 }
 
 /*
  * Initialize the taskqs for every (zio type, taskq type) combination.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	int type, queue;
 
 	for (type = 0; type < ZIO_TYPES; type++)
 		for (queue = 0; queue < ZIO_TASKQ_TYPES; queue++)
 			spa_taskqs_init(spa, type, queue);
 }
 
 #ifdef _KERNEL
 #ifdef SPA_PROCESS
 /*
  * Main routine of the per-pool process that spa_activate() starts with
  * newproc().  It labels the process "zpool-<poolname>", optionally binds
  * the thread to a processor set (PSRSET_BIND) and calls sysdc_thread_enter()
  * (SYSDC), records itself in spa_proc/spa_did, creates the pool's zio
  * taskqs, and announces SPA_PROC_ACTIVE.  It then sleeps until the state
  * leaves SPA_PROC_ACTIVE, marks itself SPA_PROC_GONE, and exits.
  */
 static void
 spa_thread(void *arg)
 {
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	/* Use the same "zpool-<name>" string for psargs and comm. */
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 #ifdef PSRSET_BIND
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		/* A failed bind only produces a warning; we carry on. */
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 #endif
 
 #ifdef SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	/* Publish our identity before the taskqs are created. */
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	/* Wake whoever is waiting for the CREATED state to end. */
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	/* Sleep until someone moves the state away from ACTIVE. */
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	/* Hand the pool back to p0 and tell waiters that we are gone. */
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif	/* SPA_PROCESS */
 #endif
 
 /*
  * Activate an uninitialized pool: record the open mode, create the metaslab
  * classes, optionally a covering process, the zio taskqs, the TRIM thread,
  * and the pool's bookkeeping lists and error trees.
  */
 static void
 spa_activate(spa_t *spa, int mode)
 {
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_mode = mode;
 	spa->spa_state = POOL_STATE_ACTIVE;
 
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef SPA_PROCESS
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) != 0) {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		} else {
 			/* Wait for spa_thread() to leave the CREATED state. */
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		}
 	}
 #endif	/* SPA_PROCESS */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	ASSERT(spa->spa_proc == &p0);
 	if (spa->spa_proc == &p0)
 		spa_create_zio_taskqs(spa);
 
 	/* Start TRIM thread. */
 	trim_thread_create(spa);
 
 	/* Dirty-vdev and evicting-objset lists. */
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
 	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	/* Error logs, both ordered by spa_error_entry_compare(). */
 	avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Opposite of spa_activate().  Stops the TRIM thread, destroys the lists,
  * zio taskqs, metaslab classes and error trees, returns the pool to
  * POOL_STATE_UNINITIALIZED and, when a covering process is running, asks
  * it to exit and waits until it reports SPA_PROC_GONE.
  *
  * The ASSERTs at the top require that syncing is off and that the dsl
  * pool, root vdev and async zio root are already gone.
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	/*
 	 * Stop TRIM thread in case spa_unload() wasn't called directly
 	 * before spa_deactivate().
 	 */
 	trim_thread_destroy(spa);
 
 	spa_evicting_os_wait(spa);
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	/* Tear down every (zio type, taskq type) slot. */
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * Handshake with spa_thread(): request DEACTIVATE, then wait for
 	 * the thread to answer with GONE before resetting to NONE.
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 #ifdef SPA_PROCESS
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 #endif	/* SPA_PROCESS */
 }
 
 /*
  * Verify a pool configuration and build the matching vdev tree.  Every vdev
  * is created in the appropriate layout and left in the CLOSED state, which
  * prepares the pool for open/creation/import.  All vdev validation is done
  * by vdev_alloc().
  *
  * On success *vdp refers to the new (sub)tree and 0 is returned.  If the
  * children cannot be looked up or parsed, the partially built tree is
  * freed, *vdp is set to NULL and an errno is returned.
  */
 static int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int err;
 
 	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (err != 0)
 		return (err);
 
 	/* Leaf vdevs have no children to descend into. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	err = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 
 	/* An interior vdev without a children array is accepted as-is. */
 	if (err == ENOENT)
 		return (0);
 
 	if (err != 0) {
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *cvd;
 
 		err = spa_config_parse(spa, &cvd, kids[c], *vdp, c, atype);
 		if (err != 0) {
 			vdev_free(*vdp);
 			*vdp = NULL;
 			return (err);
 		}
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 /*
  * Opposite of spa_load().  With spa_namespace_lock held, stops the TRIM
  * thread, async tasks and syncing, waits for outstanding async I/O, and
  * then, under SCL_ALL as writer, frees the vdev tree, closes the dsl pool,
  * unloads the DDT and releases the spare and l2cache auxiliary vdevs, their
  * cached configs and the pool comment.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	int i;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop TRIM thread.
 	 */
 	trim_thread_destroy(spa);
 
 	/*
 	 * Stop async tasks.
 	 */
 	spa_async_suspend(spa);
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		/* One root zio per CPU slot; note this 'i' shadows the outer. */
 		for (int i = 0; i < max_ncpus; i++)
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/* Release the spare vdevs, their array and their cached config. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/* Same for the l2cache vdevs; their stats are cleared first. */
 	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 	}
 	if (spa->spa_l2cache.sav_vdevs) {
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	spa->spa_async_suspended = 0;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  */
 static void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 	/* Caller must hold every config lock as writer. */
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		vd = spa->spa_spares.sav_vdevs[i];
 
 		/*
 		 * Undo the spare registration that the load loop below
 		 * performs for a matching vdev in the active configuration.
 		 */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL && tvd->vdev_isspare)
 			spa_spare_remove(tvd);
 		vdev_close(vd);
 		vdev_free(vd);
 	}
 
 	if (spa->spa_spares.sav_vdevs)
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 
 	/* No stashed config means no spares at all. */
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.  For each spare, there are potentially two different
 	 * vdev_t structures associated with it: one in the list of spares
 	 * (used only for basic validation purposes) and one in the active
 	 * vdev configuration (if it's spared in).  During this phase we open
 	 * and validate each vdev on the spare list.  If the vdev also exists
 	 * in the active configuration, then we also mark this vdev as an
 	 * active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pools would think the
 			 * spare is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		/* An aux vdev is its own top-level vdev. */
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that fails to open stays listed but is not added. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* 'spares' is reused here for the freshly generated nvlists. */
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	/* The generated nvlists and the temporary array are released. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  */
 static void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	/* Caller must hold every config lock as writer. */
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* Without a stashed config there are no l2cache devices to load. */
 	if (sav->sav_config != NULL) {
 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
 		newvdevs = NULL;
 	}
 
 	/*
 	 * Detach the current array from 'sav'.  sav_count stays 0 until
 	 * the new array is installed further down.
 	 */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 * Clearing the old slot keeps it out of the
 				 * purge loop below.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			/* An aux vdev is its own top-level vdev. */
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	for (i = 0; i < oldnvdevs; i++) {
 		uint64_t pool;
 
 		vd = oldvdevs[i];
 		if (vd != NULL) {
 			ASSERT(vd->vdev_isl2cache);
 
 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
 			vdev_clear_stats(vd);
 			vdev_free(vd);
 		}
 	}
 
 	if (oldvdevs)
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 
 	/* sav_count is still 0 here, so the cleanup at 'out' does nothing. */
 	if (sav->sav_config == NULL)
 		goto out;
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* 'l2cache' is reused here for the freshly generated nvlists. */
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
 	/* Release the generated nvlists and the temporary array, if any. */
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * '*value'.  The packed size is taken from the first 64-bit word of the
  * object's bonus buffer.  '*value' is set to NULL before anything else is
  * attempted.  Returns 0 or the error from dmu_bonus_hold(), dmu_read() or
  * nvlist_unpack().
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *bonus;
 	char *buf = NULL;
 	size_t buflen = 0;
 	int err;
 
 	*value = NULL;
 
 	err = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus);
 	if (err != 0)
 		return (err);
 
 	/* Only the size is needed from the bonus buffer. */
 	buflen = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	buf = kmem_alloc(buflen, KM_SLEEP);
 	err = dmu_read(spa->spa_meta_objset, obj, 0, buflen, buf,
 	    DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, buflen, value, 0);
 	kmem_free(buf, buflen);
 
 	return (err);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd'.  For every leaf that is dead and is
  * not a hole, post an autoreplace request and an ESC_ZFS_VDEV_CHECK event
  * so that the autoreplace code learns the device could not be opened.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	int c = 0;
 
 	/* Children are visited before the vdev itself. */
 	while (c < vd->vdev_children) {
 		spa_check_removed(vd->vdev_child[c]);
 		c++;
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd) ||
 	    vd->vdev_ishole)
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Validate the current config against the MOS config.
  *
  * The vdev tree in 'config' is parsed into a temporary root (mrvd) and
  * compared, top-level vdev by top-level vdev, with the pool's current root
  * vdev (rvd).  Unless ZFS_IMPORT_MISSING_LOG is set, any log device that is
  * missing from the current tree but known to the MOS config is reported in
  * spa_load_info under ZPOOL_CONFIG_MISSING_DEVICES.  With the flag set,
  * such a log device is swapped in from the MOS tree instead.
  *
  * Returns B_TRUE when the guid sum of rvd matches the uberblock's.
  */
 static boolean_t
 spa_config_valid(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv;
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
 	/* Both loops below index the two trees with the same 'c'. */
 	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing devices in this config.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *missing;
 		uint64_t idx = 0;
 
 		/* Room for one nvlist pointer per top-level vdev. */
 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 		VERIFY(nvlist_alloc(&missing, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 			vdev_t *mtvd  = mrvd->vdev_child[c];
 
 			if (tvd->vdev_ops == &vdev_missing_ops &&
 			    mtvd->vdev_ops != &vdev_missing_ops &&
 			    mtvd->vdev_islog)
 				child[idx++] = vdev_config_generate(spa, mtvd,
 				    B_FALSE, 0);
 		}
 
 		if (idx) {
 			VERIFY(nvlist_add_nvlist_array(missing,
 			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, missing) == 0);
 
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(missing);
 		/* Freed with the same size expression used to allocate. */
 		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t *));
 	}
 
 	/*
 	 * Compare the root vdev tree with the information we have
 	 * from the MOS config (mrvd). Check each top-level vdev
 	 * with the corresponding MOS config top-level (mtvd).
 	 */
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		vdev_t *mtvd  = mrvd->vdev_child[c];
 
 		/*
 		 * Resolve any "missing" vdevs in the current configuration.
 		 * If we find that the MOS config has more accurate information
 		 * about the top-level vdev then use that vdev instead.
 		 */
 		if (tvd->vdev_ops == &vdev_missing_ops &&
 		    mtvd->vdev_ops != &vdev_missing_ops) {
 
 			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
 				continue;
 
 			/*
 			 * Device specific actions.
 			 */
 			if (mtvd->vdev_islog) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 			} else {
 				/*
 				 * XXX - once we have 'readonly' pool
 				 * support we should be able to handle
 				 * missing data devices by transitioning
 				 * the pool to readonly.
 				 */
 				continue;
 			}
 
 			/*
 			 * Swap the missing vdev with the data we were
 			 * able to obtain from the MOS config.
 			 */
 			vdev_remove_child(rvd, tvd);
 			vdev_remove_child(mrvd, mtvd);
 
 			vdev_add_child(rvd, mtvd);
 			vdev_add_child(mrvd, tvd);
 
 			/* vdev_load() is called with the config lock dropped. */
 			spa_config_exit(spa, SCL_ALL, FTAG);
 			vdev_load(mtvd);
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 			vdev_reopen(rvd);
 		} else if (mtvd->vdev_islog) {
 			/*
 			 * Load the slog device's state from the MOS config
 			 * since it's possible that the label does not
 			 * contain the most up-to-date information.
 			 */
 			vdev_load_log_state(tvd, mtvd);
 			vdev_reopen(tvd);
 		}
 	}
 	vdev_free(mrvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Ensure we were able to validate the config.
 	 */
 	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
 }
 
 /*
  * Check for missing log devices.  When the log state is MISSING or UNKNOWN,
  * every dataset's log chain is checked with zil_check_log_chain(); a failure
  * marks the log state as MISSING.  Returns B_TRUE when that check failed and
  * B_FALSE otherwise, including when the state needed no check.
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	boolean_t missing;
 	const int state = spa->spa_log_state;
 
 	/*
 	 * MISSING is rechecked in case the slog has been restored;
 	 * any state other than MISSING or UNKNOWN needs no check.
 	 */
 	if (state != SPA_LOG_MISSING && state != SPA_LOG_UNKNOWN)
 		return (B_FALSE);
 
 	missing = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 	if (missing)
 		spa_set_log_state(spa, SPA_LOG_MISSING);
 
 	return (missing);
 }
 
 /*
  * Passivate the metaslab group of every top-level log vdev.  Returns B_TRUE
  * if at least one log vdev was found, B_FALSE otherwise (including when the
  * pool reports no slogs).  The caller holds SCL_ALLOC as writer.
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t found = B_FALSE;
 	int c;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	if (!spa_has_slogs(spa))
 		return (B_FALSE);
 
 	for (c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_passivate(top->vdev_mg);
 		found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Activate the metaslab group of every top-level log vdev.  The caller
  * holds SCL_ALLOC as writer.
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_activate(top->vdev_mg);
 	}
 }
 
 /*
  * Run zil_vdev_offline() over every dataset in the pool.  On success, wait
  * for the current txg to sync before returning 0; otherwise return the
  * error from dmu_objset_find().
  */
 int
 spa_offline_log(spa_t *spa)
 {
 	int err;
 
 	err = dmu_objset_find(spa_name(spa), zil_vdev_offline,
 	    NULL, DS_FIND_CHILDREN);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (0);
 }
 
 /*
  * Run spa_check_removed() on every vdev of an auxiliary vdev list.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	int idx = 0;
 
 	while (idx < sav->sav_count) {
 		spa_check_removed(sav->sav_vdevs[idx]);
 		idx++;
 	}
 }
 
 /*
  * For a zio that completed without error, raise spa_claim_max_txg to the
  * birth txg of the zio's block pointer if that is larger.  Zios that
  * failed are ignored.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error == 0) {
 		mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 		if (zio->io_bp->blk_birth > spa->spa_claim_max_txg)
 			spa->spa_claim_max_txg = zio->io_bp->blk_birth;
 		mutex_exit(&spa->spa_props_lock);
 	}
 }
 
 /*
  * Error tallies gathered while verifying a pool at load time.  They are
  * incremented by spa_load_verify_done() and read by spa_load_verify().
  */
 typedef struct spa_load_error {
 	uint64_t	sle_meta_count;	/* failed reads counted as metadata */
 	uint64_t	sle_data_count;	/* all other failed reads */
 } spa_load_error_t;
 
 /*
  * Completion callback for the reads issued by spa_load_verify_cb().  A
  * failed read is counted as a metadata error when the block is above level
  * 0 or has a metadata type, unless its type is DMU_OT_INTENT_LOG; every
  * other failure is counted as a data error.  The read buffer is then freed
  * and the in-flight count is dropped, waking any throttled issuer.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	spa_load_error_t *sle = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 
 	if (zio->io_error != 0) {
 		int is_meta = (BP_GET_LEVEL(bp) != 0 ||
 		    DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG;
 
 		if (is_meta)
 			atomic_inc_64(&sle->sle_meta_count);
 		else
 			atomic_inc_64(&sle->sle_data_count);
 	}
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Maximum number of concurrent scrub i/os to create while verifying
  * a pool while importing it.
  */
 int spa_load_verify_maxinflight = 10000;
 /*
  * spa_load_verify_metadata: when clear, spa_load_verify() skips the pool
  * traversal and spa_load_verify_cb() issues no reads at all.
  * spa_load_verify_data: when clear, spa_load_verify_cb() skips blocks whose
  * buffer type is ARC_BUFC_DATA.
  */
 boolean_t spa_load_verify_metadata = B_TRUE;
 boolean_t spa_load_verify_data = B_TRUE;
 
 /* Expose the three tunables above under the _vfs_zfs sysctl node. */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
     &spa_load_verify_maxinflight, 0,
     "Maximum number of concurrent scrub I/Os to create while verifying a "
     "pool while importing it");
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
     &spa_load_verify_metadata, 0,
     "Check metadata on import?");
  
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
     &spa_load_verify_data, 0,
     "Check user data on import?");
  
 /*
  * Traversal callback used by spa_load_verify().  For every block pointer
  * that is not NULL, a hole or embedded, and that the verify tunables do not
  * exclude, issue an asynchronous raw scrub read as a child of the root zio
  * passed in 'arg'.  The number of reads in flight is capped at
  * spa_load_verify_maxinflight.  Always returns 0.
  */
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zio_t *rio = arg;
 	size_t size;
 	void *data;
 
 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 
 	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
 		return (0);
 
 	size = BP_GET_PSIZE(bp);
 	data = zio_data_buf_alloc(size);
 
 	/* Throttle: wait until the in-flight count drops below the cap. */
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_scrub_inflight++;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(rio, spa, bp, data, size,
 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 
 	return (0);
 }
 
 /*
  * Verify a pool being loaded by traversing it and reading its blocks,
  * unless the rewind policy says ZPOOL_NEVER_REWIND (in which case 0 is
  * returned immediately).  The metadata and data error counts are stored in
  * the spa.  If the traversal succeeded and both counts are within the
  * policy's limits, the load txg and timestamps are recorded in
  * spa_load_info and 0 is returned.  Otherwise spa_load_max_txg is set to
  * the uberblock's txg and an error is returned: ENXIO or EIO are passed
  * through from the traversal, anything else becomes EIO.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	zio_t *rio;
 	int64_t loss = 0;
 	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
 	/* Every verification read is a child of this root zio. */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 		    spa_load_verify_cb, rio);
 	}
 
 	(void) zio_wait(rio);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	/* Failure: traversal error or too many errors for the policy. */
 	if (error != 0 || sle.sle_meta_count > policy.zrp_maxmeta ||
 	    sle.sle_data_count > policy.zrp_maxdata) {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 
 		if (error == 0)
 			return (EIO);
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	/* Success: record what was loaded and how far back it is. */
 	spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
 	VERIFY(nvlist_add_int64(spa->spa_load_info,
 	    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
 
 	return (0);
 }
 
 /*
  * Look up a single uint64 value in the pool props object.  The result of
  * the lookup is deliberately ignored, so '*val' is only meaningful if the
  * caller has given it a sensible value beforehand.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	const char *propname = zpool_prop_to_name(prop);
 
 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    propname, sizeof (uint64_t), 1, val);
 }
 
 /*
  * Look up a single uint64 value by name in the pool directory object
  * (DMU_POOL_DIRECTORY_OBJECT).  Returns the zap_lookup() result.
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
 {
 	int err;
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	return (err);
 }
 
 /*
  * Put 'vdev' into VDEV_STATE_CANT_OPEN with the given aux reason and hand
  * 'err' straight back, so callers can write
  * "return (spa_vdev_err(...));".
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 
 	return (err);
 }
 
 /*
  * Repair the config after a split that only partly completed.
  *
  * The ZPOOL_CONFIG_SPLIT nvlist is present in both the splitting pool and
  * the split-off pool, but only the splitting pool carries the list of
  * guids (ZPOOL_CONFIG_SPLIT_LIST) of the vdevs being split off.
  *
  * Given that list we either rejoin every disk to this pool or finish the
  * split.  To try a rejoin, each listed vdev we can find is marked online
  * again and the root vdev is reopened.  If, after the reopen, every such
  * vdev reports VDEV_AUX_SPLIT_POOL, the disks really did leave and we call
  * vdev_split() on each of them to complete the split.  If any listed guid
  * cannot be found at all, the reopen is skipped and the split is completed
  * straight away.
  *
  * In every other case the config is left alone, with all vdevs still in
  * the original pool.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *split;
 	uint64_t *guids;
 	uint_t nguids, idx;
 	uint_t nsplit = 0;
 	vdev_t **vdevs;
 	boolean_t try_reopen = B_TRUE;

 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &split) != 0)
 		return;

 	/* without the guid list the split config is incomplete */
 	if (nvlist_lookup_uint64_array(split, ZPOOL_CONFIG_SPLIT_LIST,
 	    &guids, &nguids) != 0)
 		return;

 	vdevs = kmem_zalloc(nguids * sizeof (vdev_t *), KM_SLEEP);

 	/* resolve every guid and flip the vdevs we find back online */
 	for (idx = 0; idx < nguids; idx++) {
 		/* a zero guid stands for a hole; its slot stays NULL */
 		if (guids[idx] == 0)
 			continue;

 		vdevs[idx] = spa_lookup_by_guid(spa, guids[idx], B_FALSE);
 		if (vdevs[idx] != NULL) {
 			vdevs[idx]->vdev_offline = B_FALSE;
 		} else {
 			/*
 			 * A listed vdev is missing, so reopening is
 			 * pointless; go directly to the split.
 			 */
 			try_reopen = B_FALSE;
 		}
 	}

 	if (try_reopen) {
 		vdev_reopen(spa->spa_root_vdev);

 		/*
 		 * Count the leading slots that are either empty or report
 		 * VDEV_AUX_SPLIT_POOL; stop at the first one that does not.
 		 */
 		for (idx = 0; idx < nguids; idx++) {
 			if (vdevs[idx] != NULL &&
 			    vdevs[idx]->vdev_stat.vs_aux !=
 			    VDEV_AUX_SPLIT_POOL)
 				break;
 			nsplit++;
 		}
 	}

 	/*
 	 * Finish the split when every disk turned out to belong to the new
 	 * pool, or when we never got as far as looking at them.
 	 */
 	if (!try_reopen || nsplit == nguids) {
 		for (idx = 0; idx < nguids; idx++) {
 			if (vdevs[idx] != NULL)
 				vdev_split(vdevs[idx]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}

 	kmem_free(vdevs, nguids * sizeof (vdev_t *));
 }
 
 /*
  * Front end of a pool load.  Pulls the pool guid, comment, version and
  * config txg out of spa->spa_config, refuses an import whose guid already
  * exists (EEXIST), and otherwise hands the real work to spa_load_impl().
  * On the way out it records the minimum refcount, posts an ereport for any
  * failure other than EBADF, and leaves spa_load_state at SPA_LOAD_ERROR or
  * SPA_LOAD_NONE.
  *
  * Returns 0 on success or an errno.
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
     boolean_t mosconfig)
 {
 	nvlist_t *cfg = spa->spa_config;
 	nvlist_t *split;
 	char *ereport = FM_EREPORT_ZFS_POOL;
 	char *comment;
 	uint64_t pool_guid;
 	boolean_t importing;
 	int error;

 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
 		return (SET_ERROR(EINVAL));

 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);

 	/*
 	 * The label did not always carry an explicit version, so a config
 	 * without one is treated as the initial version.
 	 */
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

 	(void) nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);

 	importing = (state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT);

 	if (importing && spa_guid_exists(pool_guid, 0)) {
 		error = SET_ERROR(EEXIST);
 	} else {
 		spa->spa_config_guid = pool_guid;

 		/* keep a private copy of any in-progress split description */
 		if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_SPLIT,
 		    &split) == 0) {
 			VERIFY(nvlist_dup(split, &spa->spa_config_splitting,
 			    KM_SLEEP) == 0);
 		}

 		/* start this attempt with an empty load-info nvlist */
 		nvlist_free(spa->spa_load_info);
 		spa->spa_load_info = fnvlist_alloc();

 		gethrestime(&spa->spa_loaded_ts);
 		error = spa_load_impl(spa, pool_guid, cfg, state, type,
 		    mosconfig, &ereport);
 	}

 	/*
 	 * Objsets that are already closed and still working their way
 	 * through eviction must not be counted as references.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = refcount_count(&spa->spa_refcount);

 	if (error != 0) {
 		if (error != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		if (error != EBADF) {
 			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
 		}
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	} else {
 		spa->spa_load_state = SPA_LOAD_NONE;
 	}
 	spa->spa_ena = 0;

 	return (error);
 }
 
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  *
  * spa		pool being loaded; the caller must hold spa_namespace_lock
  * pool_guid	guid the parsed config is expected to carry (only asserted,
  *		and not for SPA_IMPORT_ASSEMBLE)
  * config	nvlist that must contain ZPOOL_CONFIG_VDEV_TREE
  * state	load state; it is stored in spa->spa_load_state and selects
  *		the import/tryimport/recover specific paths below
  * type	SPA_IMPORT_EXISTING parses the vdev tree with VDEV_ALLOC_LOAD,
  *		any other value with VDEV_ALLOC_SPLIT; SPA_IMPORT_ASSEMBLE
  *		additionally skips label validation, the guid-sum check,
  *		split repair, spare/l2cache loading and config validation
  * mosconfig	B_FALSE marks 'config' as untrusted: the pool is accessed
  *		with FREAD, the config stored in the MOS is read, and the
  *		load is restarted through spa_load() with mosconfig B_TRUE
  * ereport	in/out; replaced with FM_EREPORT_ZFS_LOG_REPLAY when the
  *		log check fails
  *
  * Returns 0 on success or an errno.  Most failures go through
  * spa_vdev_err(), which also puts the root vdev in VDEV_STATE_CANT_OPEN.
  */
 static int
 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport)
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
 	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
 	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
 	uint64_t obj;
 	boolean_t missing_feat_write = B_FALSE;

 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
 	 * This prevents things like resilvering recently removed devices.
 	 */
 	if (!mosconfig)
 		spa->spa_mode = FREAD;

 	ASSERT(MUTEX_HELD(&spa_namespace_lock));

 	spa->spa_load_state = state;

 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
 		return (SET_ERROR(EINVAL));

 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}

 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);

 	if (error != 0)
 		return (error);

 	ASSERT(spa->spa_root_vdev == rvd);
 	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);

 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}

 	/*
 	 * Try to open all vdevs, loading each label in the process.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error != 0)
 		return (error);

 	/*
 	 * We need to validate the vdev labels against the configuration that
 	 * we have in hand, which is dependent on the setting of mosconfig. If
 	 * mosconfig is true then we're validating the vdev labels based on
 	 * that config.  Otherwise, we're validating against the cached config
 	 * (zpool.cache) that was read when we loaded the zfs module, and then
 	 * later we will recursively call spa_load() and validate against
 	 * the vdev config.
 	 *
 	 * If we're assembling a new pool that's been split off from an
 	 * existing pool, the labels haven't yet been updated so we skip
 	 * validation for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		error = vdev_validate(rvd, mosconfig);
 		spa_config_exit(spa, SCL_ALL, FTAG);

 		if (error != 0)
 			return (error);

 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 	}

 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);

 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}

 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}

 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;

 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL || nvlist_lookup_nvlist(label,
 		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
 			nvlist_free(label);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}

 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
 	}

 	nvlist_free(label);

 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;

 		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
 		    0);

 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				VERIFY(nvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "") == 0);
 			}
 		}

 		if (!nvlist_empty(unsup_feat)) {
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
 			nvlist_free(unsup_feat);
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}

 		nvlist_free(unsup_feat);
 	}

 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.  We first check to see if the pool
 	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
 	 * If it is, defer the vdev_guid_sum check till later so we
 	 * can handle missing vdevs.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}

 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;

 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;

 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}

 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}

 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}

 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();

 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;

 		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				missing_feat_write = B_TRUE;
 			}
 		}

 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}

 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);

 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}

 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (missing_feat_write &&
 		    spa_writeable(spa))) {
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}

 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;

 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}

 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}

 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	/*
 	 * The config we were handed is untrusted: read the config stored in
 	 * the MOS, compare the hostid recorded there with ours, then tear the
 	 * pool down and restart the load through spa_load() with
 	 * mosconfig == B_TRUE.
 	 */
 	if (!mosconfig) {
 		uint64_t hostid;
 		nvlist_t *policy = NULL, *nvconfig;

 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 			char *hostname;
 			unsigned long myhostid = 0;

 			VERIFY(nvlist_lookup_string(nvconfig,
 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

 #ifdef	_KERNEL
 			myhostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
 			/*
 			 * We're emulating the system's hostid in userland, so
 			 * we can't use zone_get_hostid().
 			 */
 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
 #endif	/* _KERNEL */
 			if (check_hostid && hostid != 0 && myhostid != 0 &&
 			    hostid != myhostid) {
 				nvlist_free(nvconfig);
 				cmn_err(CE_WARN, "pool '%s' could not be "
 				    "loaded as it was last accessed by "
 				    "another system (host: %s hostid: 0x%lx). "
 				    "See: http://illumos.org/msg/ZFS-8000-EY",
 				    spa_name(spa), hostname,
 				    (unsigned long)hostid);
 				return (SET_ERROR(EBADF));
 			}
 		}
 		/* carry the caller's rewind policy over to the MOS config */
 		if (nvlist_lookup_nvlist(spa->spa_config,
 		    ZPOOL_REWIND_POLICY, &policy) == 0)
 			VERIFY(nvlist_add_nvlist(nvconfig,
 			    ZPOOL_REWIND_POLICY, policy) == 0);

 		spa_config_set(spa, nvconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);

 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}

 	/* Grab the secret checksum salt from the MOS. */
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT, 1,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
 	    spa->spa_cksum_salt.zcs_bytes);
 	if (error == ENOENT) {
 		/* Generate a new salt for subsequent use */
 		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes));
 	} else if (error != 0) {
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}

 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
 	 */

 	/*
 	 * Load any hot spares for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 		if (load_nvlist(spa, spa->spa_spares.sav_object,
 		    &spa->spa_spares.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_spares.sav_sync = B_TRUE;
 	}

 	/*
 	 * Load any level 2 ARC devices for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 		    &spa->spa_l2cache.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}

 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	if (error == 0) {
 		/*
 		 * spa_prop_find() ignores lookup failures, so start from 0
 		 * (autoreplace off) rather than reading an uninitialized
 		 * local when the property is not stored in the props object.
 		 */
 		uint64_t autoreplace = 0;

 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
 		    &spa->spa_dedup_ditto);

 		spa->spa_autoreplace = (autoreplace != 0);
 	}

 	/*
 	 * If the 'autoreplace' property is set, then post a resource notifying
 	 * the ZFS DE that it should not issue any faults for unopenable
 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
 	 * unopenable vdevs so that the normal autoreplace handler can take
 	 * over.
 	 */
 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 		/*
 		 * For the import case, this is done in spa_import(), because
 		 * at this point we're using the spare definitions from
 		 * the MOS config, not necessarily from the userland config.
 		 */
 		if (state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}

 	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);

 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);

 	/*
 	 * Load the DDTs (dedup tables).
 	 */
 	error = ddt_load(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 	spa_update_dspace(spa);

 	/*
 	 * Validate the config, using the MOS config to fill in any
 	 * information which might be missing.  If we fail to validate
 	 * the config then declare the pool unfit for use. If we're
 	 * assembling a pool from a split, the log is not transferred
 	 * over.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		nvlist_t *nvconfig;

 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

 		if (!spa_config_valid(spa, nvconfig)) {
 			nvlist_free(nvconfig);
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 			    ENXIO));
 		}
 		nvlist_free(nvconfig);

 		/*
 		 * Now that we've validated the config, check the state of the
 		 * root vdev.  If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));

 		if (spa_writeable(spa) && spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
 	}

 	if (missing_feat_write) {
 		ASSERT(state == SPA_LOAD_TRYIMPORT);

 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
 	}

 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
 	 */
 	if (state != SPA_LOAD_TRYIMPORT) {
 		/* the verify error itself is what gets returned */
 		if ((error = spa_load_verify(spa)) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    error));
 	}

 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
 		dsl_pool_t *dp = spa_get_dsl(spa);

 		ASSERT(state != SPA_LOAD_TRYIMPORT);

 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 		 * invoked from zil_claim_log_block()'s i/o done callback.
 		 * Price of rollback is that we abandon the log.
 		 */
 		spa->spa_claiming = B_TRUE;

 		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
 		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);

 		spa->spa_claiming = B_FALSE;

 		spa_set_log_state(spa, SPA_LOG_GOOD);
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);

 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by either zil_check_log_chain()
 		 * (invoked from spa_check_logs()) or zil_claim() above.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

 		/*
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
 		 *
 		 * If this is a verbatim import, trust the current
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
 		    state == SPA_LOAD_IMPORT ||
 		    state == SPA_LOAD_RECOVER ||
 		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 			need_update = B_TRUE;

 		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;

 		/*
 		 * Update the config cache asynchronously in case we're the
 		 * root pool, in which case the config cache isn't writable yet.
 		 */
 		if (need_update)
 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);

 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open");

 		/*
 		 * Delete any inconsistent datasets.
 		 */
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 	}

 	return (0);
 }
 
 /*
  * Retry a failed load one txg earlier: tear the pool down, cap
  * spa_load_max_txg just below the uberblock txg of the failed attempt,
  * bring the pool back up in its previous mode with async tasks suspended,
  * and run spa_load() again.  Returns whatever spa_load() returns.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	int saved_mode = spa->spa_mode;
 	int rc;

 	spa_unload(spa);
 	spa_deactivate(spa);

 	/* only uberblocks older than the one that just failed qualify */
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;

 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);

 	rc = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig);

 	return (rc);
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * max_request	upper bound for the txg of the first load attempt;
  *		anything other than UINT64_MAX also turns on
  *		spa_extreme_rewind for that attempt
  * rewind_flags	ZPOOL_NEVER_REWIND returns the first load error without
  *		retrying; ZPOOL_EXTREME_REWIND lowers the retry floor from
  *		(last uberblock txg - TXG_DEFER_SIZE) to TXG_INITIAL
  *
  * Returns 0 when the first load succeeds.  Otherwise returns the rewind
  * result for SPA_LOAD_RECOVER, or the original load error for every other
  * state (with the rewind attempt's load info stored under
  * ZPOOL_CONFIG_REWIND_INFO in spa_load_info).
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;

 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}

 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
 	    mosconfig);
 	if (load_error == 0)
 		return (0);

 	/*
 	 * Snapshot the config of the failed attempt (when a vdev tree
 	 * exists) so it can be put back after the rewind attempts below.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		return (load_error);
 	}

 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}

 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;

 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state, mosconfig);
 	}

 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;

 	/*
 	 * Put the saved config back unless a recovery rewind succeeded.
 	 * In that remaining case nobody else holds a reference to 'config',
 	 * so free it here instead of leaking it.  nvlist_free() accepts
 	 * NULL, which covers the case where no snapshot was taken.
 	 * NOTE(review): this assumes spa_config_set() takes ownership of
 	 * the nvlist it is given, as the lack of a free after it suggests.
 	 */
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);

 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);

 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;

 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
  * ambiguous state.
  *
  * pool		name of the pool to look up
  * spapp	out: the opened spa_t on success, NULL on failure
  * tag		reference tag handed to spa_open_ref()
  * nvpolicy	optional rewind policy; when NULL the policy is read from
  *		spa->spa_config instead
  * config	optional out parameter (may be NULL); receives a generated
  *		config on success, or a copy of spa_config plus the load
  *		info when the first open fails
  *
  * Returns 0 on success, ENOENT when the pool is unknown or had to be
  * removed from the namespace, or the error from spa_load_best().
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
     nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;

 	*spapp = NULL;

 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (mutex_owner(&spa_namespace_lock) != curthread) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}

 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}

 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_rewind_policy_t policy;

 		firstopen = B_TRUE;

 		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zrp_request & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;

 		spa_activate(spa, spa_mode_global);

 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

 		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
 		    policy.zrp_request);

 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(ENOENT));
 		}

 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				VERIFY(nvlist_dup(spa->spa_config, config,
 				    KM_SLEEP) == 0);
 				VERIFY(nvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info) == 0);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}

 	spa_open_ref(spa, tag);

 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.  'config' is optional (spa_open()
 	 * passes NULL) and the rewind request may come from spa_config
 	 * rather than from the caller, so it must be checked before it is
 	 * dereferenced.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info) == 0);
 	}

 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 		if (firstopen)
 			zvol_create_minors(spa->spa_name);
 #endif
 #endif
 	}

 	*spapp = spa;

 	return (0);
 }
 
 /*
  * Open the pool called 'name', handing the caller-provided policy nvlist
  * and the optional config out-parameter straight to spa_open_common().
  * The result of spa_open_common() is returned unchanged.
  */
 int
 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
     nvlist_t **config)
 {
 	int error;
 
 	error = spa_open_common(name, spapp, tag, policy, config);
 	return (error);
 }
 
 /*
  * Open the pool called 'name' with neither a policy nvlist nor a config
  * out-parameter; this is spa_open_common() with both of those set to NULL.
  */
 int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
 	int error;
 
 	error = spa_open_common(name, spapp, tag, NULL, NULL);
 	return (error);
 }
 
 /*
  * Find the spa_t registered under 'name' and bump its inject reference
  * count, which keeps the pool from being exported or destroyed.  Both the
  * lookup and the increment happen under spa_namespace_lock.  Returns the
  * spa_t, or NULL when no pool of that name is found.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *found;
 
 	mutex_enter(&spa_namespace_lock);
 	found = spa_lookup(name);
 	if (found != NULL)
 		found->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (found);
 }
 
 /*
  * Release one inject reference on 'spa'.  The counter is adjusted while
  * holding spa_namespace_lock, mirroring spa_inject_addref().
  */
 void
 spa_inject_delref(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_inject_ref -= 1;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Publish the pool's spare devices in 'config'.
  *
  * The spare array stored in spa_spares.sav_config is added to the vdev tree
  * found in 'config'.  Any spare that spa_spare_exists() reports with a
  * non-zero pool value then has its vdev stats rewritten to CANT_OPEN /
  * SPARED.  Nothing is done when the pool has no spares.  The caller must
  * hold SCL_CONFIG as reader.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *vdev_tree;
 	nvlist_t **sparelist;
 	uint_t nspares;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &nspares) == 0);
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then look it up again so that the
 	 * stats edits below are applied to the entries stored under the tree.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, sparelist, nspares) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &nspares) == 0);
 
 	/*
 	 * Walk the spares looking for ones that have since been pressed
 	 * into service as an active spare and flag them accordingly.
 	 */
 	for (uint_t s = 0; s < nspares; s++) {
 		uint64_t spare_guid;
 		uint64_t owner;
 		vdev_stat_t *stats;
 		uint_t nstats;
 
 		VERIFY(nvlist_lookup_uint64(sparelist[s],
 		    ZPOOL_CONFIG_GUID, &spare_guid) == 0);
 		if (!spa_spare_exists(spare_guid, &owner, NULL) ||
 		    owner == 0ULL)
 			continue;
 
 		VERIFY(nvlist_lookup_uint64_array(sparelist[s],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&stats,
 		    &nstats) == 0);
 		stats->vs_state = VDEV_STATE_CANT_OPEN;
 		stats->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Publish the pool's l2cache devices in 'config' with current vdev stats.
  *
  * The l2cache array stored in spa_l2cache.sav_config is added to the vdev
  * tree found in 'config'.  Each entry is then matched by guid against
  * spa_l2cache.sav_vdevs and its stats are refreshed with vdev_get_stats().
  * Nothing is done when the pool has no l2cache devices.  The caller must
  * hold SCL_CONFIG as reader.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *vdev_tree;
 	nvlist_t **cachelist;
 	uint_t ncache;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &cachelist, &ncache) == 0);
 	if (ncache == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then look it up again so that the
 	 * stats refresh below touches the entries stored under the tree.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, cachelist, ncache) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, &cachelist, &ncache) == 0);
 
 	/*
 	 * Refresh the stats of every level 2 cache device.
 	 */
 	for (uint_t c = 0; c < ncache; c++) {
 		uint64_t cache_guid;
 		vdev_t *match = NULL;
 		vdev_stat_t *stats;
 		uint_t nstats;
 
 		VERIFY(nvlist_lookup_uint64(cachelist[c],
 		    ZPOOL_CONFIG_GUID, &cache_guid) == 0);
 
 		/* Locate the in-core vdev carrying this guid. */
 		for (uint_t k = 0; k < spa->spa_l2cache.sav_count; k++) {
 			vdev_t *cand = spa->spa_l2cache.sav_vdevs[k];
 
 			if (cache_guid == cand->vdev_guid) {
 				match = cand;
 				break;
 			}
 		}
 		ASSERT(match != NULL);
 
 		VERIFY(nvlist_lookup_uint64_array(cachelist[c],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&stats, &nstats)
 		    == 0);
 		vdev_get_stats(match, stats);
 	}
 }
 
 /*
  * Attach a ZPOOL_CONFIG_FEATURE_STATS nvlist to 'config'.
  *
  * The nvlist holds one uint64 per entry found in the pool's
  * features-for-read and features-for-write ZAP objects (read first, then
  * write), keyed by the ZAP entry name.  If the pool is suspended the ZAPs
  * are not walked and an empty nvlist is attached instead.  The caller must
  * hold SCL_CONFIG as reader.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *feat_nvl;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 	VERIFY(nvlist_alloc(&feat_nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* We may be unable to read features if pool is suspended. */
 	if (!spa_suspended(spa)) {
 		/* pass 0: features for read, pass 1: features for write */
 		for (int pass = 0; pass < 2; pass++) {
 			zap_cursor_t cursor;
 			zap_attribute_t attr;
 			uint64_t zapobj;
 
 			zapobj = (pass == 0) ? spa->spa_feat_for_read_obj :
 			    spa->spa_feat_for_write_obj;
 			if (zapobj == 0)
 				continue;
 
 			zap_cursor_init(&cursor, spa->spa_meta_objset, zapobj);
 			while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 				ASSERT(attr.za_integer_length ==
 				    sizeof (uint64_t) &&
 				    attr.za_num_integers == 1);
 				VERIFY3U(0, ==, nvlist_add_uint64(feat_nvl,
 				    attr.za_name, attr.za_first_integer));
 				zap_cursor_advance(&cursor);
 			}
 			zap_cursor_fini(&cursor);
 		}
 	}
 
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    feat_nvl) == 0);
 	nvlist_free(feat_nvl);
 }
 
 /*
  * Retrieve the configuration and statistics of the pool 'name'.
  *
  * The pool is opened through spa_open_common(), which may hand back a
  * config in *config even when the open itself fails.  When the open
  * succeeds and a config was produced, it is augmented with the load time,
  * the error log size, the suspended state (if suspended), spares, l2cache
  * devices and feature stats.
  *
  * If 'altroot' is non-NULL it receives the pool's alternate root (at most
  * 'buflen' bytes, per spa_altroot()), or an empty string when the pool is
  * not in the namespace at all.
  *
  * Returns the error from spa_open_common().
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
 
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
 		 * This still leaves a window of inconsistency where the spares
 		 * or l2cache devices could change and the config would be
 		 * self-inconsistent.
 		 */
 		/* Held until the matching spa_config_exit() near the end. */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loadtimes[2];
 
 			/* seconds and nanoseconds of spa_loaded_ts */
 			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
 			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
 			VERIFY(nvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
 
 			VERIFY(nvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_get_errlog_size(spa)) == 0);
 
 			/* The value stored for a suspended pool is its failmode. */
 			if (spa_suspended(spa))
 				VERIFY(nvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    spa->spa_failmode) == 0);
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
 			spa_add_feature_stats(spa, *config);
 		}
 	}
 
 	/*
 	 * We want to get the alternate root even for faulted pools, so we cheat
 	 * and call spa_lookup() directly.
 	 */
 	if (altroot) {
 		if (spa == NULL) {
 			mutex_enter(&spa_namespace_lock);
 			spa = spa_lookup(name);
 			if (spa)
 				spa_altroot(spa, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			/*
 			 * No reference was taken on this spa, so reset the
 			 * pointer to keep the cleanup below from running.
 			 */
 			spa = NULL;
 			mutex_exit(&spa_namespace_lock);
 		} else {
 			spa_altroot(spa, altroot, buflen);
 		}
 	}
 
 	/* Undo the config lock and the open reference taken above. */
 	if (spa != NULL) {
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Validate that the auxiliary device array is well formed.  We must have an
  * array of nvlists, each which describes a valid leaf vdev.  If 'mode' is
  * VDEV_ALLOC_SPARE or VDEV_ALLOC_L2CACHE, devices that fail to open or to
  * have their label initialized are tolerated, as long as they are
  * well-formed.
  *
  * 'config' names the nvlist array to validate inside 'nvroot', 'version' is
  * the minimum pool version that supports this device type, and 'label' is
  * passed to vdev_label_init().  'sav' has its pending list pointed at the
  * array for the duration of the validation.
  *
  * Returns 0 when the array is absent or valid, EINVAL for an empty array or
  * a non-leaf entry, ENOTSUP when the pool version is too old, ENOTBLK (kernel
  * only) for a non-disk l2cache device, or the error from spa_config_parse(),
  * vdev_open() or vdev_label_init().
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **dev;
 	uint_t i, ndev;
 	vdev_t *vd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * It's acceptable to have no devs specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 		return (0);
 
 	/* An array that is present but empty is rejected. */
 	if (ndev == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = dev;
 	sav->sav_npending = ndev;
 
 	for (i = 0; i < ndev; i++) {
 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
 		    mode)) != 0)
 			goto out;
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * The L2ARC currently only supports disk devices in
 		 * kernel context.  For user-level testing, we allow it.
 		 */
 #ifdef _KERNEL
 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
 			error = SET_ERROR(ENOTBLK);
 			vdev_free(vd);
 			goto out;
 		}
 #endif
 		vd->vdev_top = vd;
 
 		/*
 		 * On success, record the guid of the opened vdev in the
 		 * caller's nvlist entry for this device.
 		 */
 		if ((error = vdev_open(vd)) == 0 &&
 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 			    vd->vdev_guid) == 0);
 		}
 
 		/* The vdev was only needed for validation. */
 		vdev_free(vd);
 
 		/*
 		 * Open/label failures are fatal only outside the SPARE and
 		 * L2CACHE modes; in those modes the error is discarded and
 		 * the next device is examined.
 		 */
 		if (error &&
 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
 			goto out;
 		else
 			error = 0;
 	}
 
 out:
 	/* Always clear the pending list, on success and failure alike. */
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate both kinds of auxiliary devices described in 'nvroot': first the
  * spares, then the l2cache devices.  The l2cache list is only examined when
  * the spares validated cleanly.  Returns 0 or the first error encountered.
  * The caller must hold SCL_ALL as writer.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 	if (error == 0) {
 		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (error);
 }
 
 /*
  * Record 'ndevs' auxiliary devices from 'devs' in sav->sav_config under the
  * nvlist key 'config'.  When sav_config does not exist yet it is created
  * holding just the new devices; otherwise the stored array is replaced by
  * the old entries followed by the new ones.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **curdevs;
 	nvlist_t **merged;
 	uint_t ncur;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/*
 		 * Nothing stored so far: start a fresh dev list.
 		 */
 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
 		    devs, ndevs) == 0);
 		return;
 	}
 
 	/*
 	 * Build the new dev list by concatenating the current list
 	 * with the incoming one.
 	 */
 	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &curdevs, &ncur) == 0);
 
 	merged = kmem_alloc(sizeof (void *) * (ndevs + ncur), KM_SLEEP);
 	for (i = 0; i < ncur; i++) {
 		VERIFY(nvlist_dup(curdevs[i], &merged[i], KM_SLEEP) == 0);
 	}
 	for (i = 0; i < ndevs; i++) {
 		VERIFY(nvlist_dup(devs[i], &merged[i + ncur],
 		    KM_SLEEP) == 0);
 	}
 
 	/* Swap the stored array for the merged one. */
 	VERIFY(nvlist_remove(sav->sav_config, config,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    config, merged, ndevs + ncur) == 0);
 
 	/* The temporary duplicates and the pointer array are released. */
 	for (i = 0; i < ncur + ndevs; i++) {
 		nvlist_free(merged[i]);
 	}
 	kmem_free(merged, (ncur + ndevs) * sizeof (void *));
 }
 
 /*
  * Stop and drop the pool's level 2 ARC devices: every l2cache vdev that
  * spa_l2cache_exists() reports with a non-zero pool value and that is
  * currently present in the L2ARC is removed from it.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	for (int idx = 0; idx < sav->sav_count; idx++) {
 		vdev_t *cache_vd = sav->sav_vdevs[idx];
 		uint64_t pool;
 
 		ASSERT(cache_vd != NULL);
 
 		if (!spa_l2cache_exists(cache_vd->vdev_guid, &pool))
 			continue;
 		if (pool == 0ULL || !l2arc_vdev_present(cache_vd))
 			continue;
 
 		l2arc_remove_vdev(cache_vd);
 	}
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' whose vdev tree is described by 'nvroot'.
  * 'props' optionally carries pool properties (altroot, version, feature
  * properties, ...) and 'zplprops' is passed through to dsl_pool_create().
  *
  * The whole operation runs under spa_namespace_lock.  Returns 0 on success,
  * EEXIST if a pool of that name is already in the namespace, the error from
  * spa_prop_validate(), EINVAL if zfs_allocatable_devs() rejects 'nvroot', or
  * the error from spa_config_parse(), vdev_create() or spa_validate_aux().
  * On any of those failures the partially built spa_t is torn down and
  * removed from the namespace again.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
 	boolean_t has_features;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	/* altroot stays NULL when the property is not supplied. */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/* Note whether any of the supplied properties is a feature property. */
 	has_features = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem)))
 			has_features = B_TRUE;
 	}
 
 	/*
 	 * Use SPA_VERSION when feature properties were given or when no
 	 * explicit version property is present; otherwise the requested
 	 * version is used.
 	 */
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	/* Seed the in-core uberblock one txg behind the first txg. */
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	/* One root zio is created per CPU slot (max_ncpus entries). */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	/*
 	 * Create the vdevs and validate the aux devices; only when all of
 	 * that succeeded is each top-level vdev sized and expanded.
 	 */
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_ashift_optimize(rvd->vdev_child[c]);
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Common failure exit for the vdev setup above. */
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only around dsl_pool_create(). */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 
 	spa_update_dspace(spa);
 
 	/* Everything below, up to dmu_tx_commit(), is done in this tx. */
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES)
 		spa_feature_create_zap_objects(spa, tx);
 
 	/* Record the version the pool was created with. */
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
 		spa_history_create_obj(spa, tx);
 
 	/*
 	 * Generate some random noise for salted checksums to operate on.
 	 */
 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
 
 	/*
 	 * Set pool properties.
 	 */
 	/* Start from the defaults; user-supplied props are synced below. */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
 	/*
 	 * We explicitly wait for the first transaction to complete so that our
 	 * bean counters are appropriately updated.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
 
 	spa_history_log_version(spa, "create");
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	/* Snapshot the current reference count as the pool's minimum. */
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 #ifdef _KERNEL
 #ifdef illumos
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
  */
 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 
 /*
  * Read the label of the root disk identified by 'devpath'/'devid' and turn
  * it into a pool configuration: the label's vdev tree becomes the single
  * child of a freshly built root vdev, which then replaces the label's
  * ZPOOL_CONFIG_VDEV_TREE entry.  The label's own guid is stored in '*guid'.
  * Returns the configuration, or NULL if the label cannot be read.
  */
 static nvlist_t *
 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
 {
 	nvlist_t *label_cfg;
 	nvlist_t *top_vdev;
 	nvlist_t *root_vdev;
 	uint64_t pool_guid;
 
 	if (vdev_disk_read_rootlabel(devpath, devid, &label_cfg) != 0)
 		return (NULL);
 
 	/*
 	 * Pull out the pieces of the label needed below: its vdev tree
 	 * (the future child of the root vdev), the pool guid and the
 	 * label's own guid.
 	 */
 	VERIFY(nvlist_lookup_nvlist(label_cfg, ZPOOL_CONFIG_VDEV_TREE,
 	    &top_vdev) == 0);
 	VERIFY(nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) == 0);
 	VERIFY(nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_GUID,
 	    guid) == 0);
 
 	/*
 	 * Assemble a root vdev that has the top-level vdev as its only
 	 * child.
 	 */
 	VERIFY(nvlist_alloc(&root_vdev, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(root_vdev, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_GUID,
 	    pool_guid) == 0);
 	VERIFY(nvlist_add_nvlist_array(root_vdev, ZPOOL_CONFIG_CHILDREN,
 	    &top_vdev, 1) == 0);
 
 	/*
 	 * Install the new root vdev as the configuration's vdev tree in
 	 * place of the one that came from the label.
 	 */
 	VERIFY(nvlist_add_nvlist(label_cfg, ZPOOL_CONFIG_VDEV_TREE,
 	    root_vdev) == 0);
 	nvlist_free(root_vdev);
 
 	return (label_cfg);
 }
 
 /*
  * Recursively search the vdev tree below 'vd' for a leaf whose on-disk
  * label carries a pool txg newer than '*txg'.  Each time such a leaf is
  * found, '*txg' is raised to its label txg and '*avd' is pointed at it.
  * Leaves whose label cannot be read are ignored.
  */
 static void
 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
 {
 	nvlist_t *label;
 	uint64_t label_txg;
 	int c;
 
 	/* Visit all children before looking at this vdev itself. */
 	c = 0;
 	while (c < vd->vdev_children) {
 		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
 		c++;
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
 	    &label) != 0)
 		return;
 
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
 	    &label_txg) == 0);
 
 	/*
 	 * A strictly newer label txg makes this leaf the better
 	 * boot device.
 	 */
 	if (label_txg > *txg) {
 		*txg = label_txg;
 		*avd = vd;
 	}
 
 	nvlist_free(label);
 }
 
 /*
  * Import a root pool.
  *
  * For x86. devpath_list will consist of devid and/or physpath name of
  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
  * The GRUB "findroot" command will return the vdev we should boot.
  *
  * For Sparc, devpath_list consists the physpath name of the booting device
  * no matter the rootpool is a single device pool or a mirrored pool.
  * e.g.
  *	"/pci@1f,0/ide@d/disk@0,0:a"
  *
  * Returns 0 on success, EIO if no pool label can be read from the boot
  * device, the error from spa_config_parse(), ENOENT if the boot vdev is
  * not found in the parsed tree, or EINVAL if another device carries a newer
  * label or the boot device is a spared (inactive) member of a spare vdev.
  * The vdev tree parsed here is only used for these checks and is freed
  * before returning.
  */
 int
 spa_import_rootpool(char *devpath, char *devid)
 {
 	spa_t *spa;
 	vdev_t *rvd, *bvd, *avd = NULL;
 	nvlist_t *config, *nvtop;
 	uint64_t guid, txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(devpath, devid, &guid);
 #if defined(_OBP) && defined(_KERNEL)
 	/* Retry once with the path rewritten for iSCSI boot. */
 	if (config == NULL) {
 		if (strstr(devpath, "/iscsi/ssd") != NULL) {
 			/* iscsi boot */
 			get_iscsi_bootpath_phy(devpath);
 			config = spa_generate_rootconf(devpath, devid, &guid);
 		}
 	}
 #endif
 	if (config == NULL) {
 		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
 		return (SET_ERROR(EIO));
 	}
 
 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &pname) == 0);
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pname)) != NULL) {
 		/*
 		 * Remove the existing root pool from the namespace so that we
 		 * can replace it with the correct config we just read in.
 		 */
 		spa_remove(spa);
 	}
 
 	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		/*
 		 * NOTE(review): the spa_t added above is not removed on
 		 * this path -- confirm that is intended.
 		 */
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    pname);
 		return (error);
 	}
 
 	/*
 	 * Get the boot vdev.
 	 */
 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
 		    (u_longlong_t)guid);
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/*
 	 * Determine if there is a better boot device.
 	 */
 	/* avd changes only if some leaf has a label txg newer than 'txg'. */
 	avd = bvd;
 	spa_alt_rootvdev(rvd, &avd, &txg);
 	if (avd != bvd) {
 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
 		    "try booting from '%s'", avd->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * If the boot device is part of a spare vdev then ensure that
 	 * we're booting off the active spare.
 	 */
 	/* The device suggested to the user is the spare vdev's last child. */
 	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    !bvd->vdev_isspare) {
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
 		    bvd->vdev_parent->
 		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	error = 0;
 out:
 	/* Success and failure both discard the temporary vdev tree. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (error);
 }
 
 #else	/* !illumos */
 
 extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
     uint64_t *count);
 
 /*
  * Build a pool-level configuration for the root pool 'name' from the vdev
  * labels returned by vdev_geom_read_pool_label().
  *
  * The label with the highest pool txg is used as the template.  The vdev
  * trees of the labels read become the children of a new root vdev; slots
  * listed in the template's hole array become VDEV_TYPE_HOLE entries and
  * any slot still empty becomes a VDEV_TYPE_MISSING entry.  The per-vdev
  * guid entries are stripped from the result.
  *
  * Returns the configuration (owned by the caller), or NULL when the labels
  * cannot be read or none of the returned slots holds a label.
  */
 static nvlist_t *
 spa_generate_rootconf(const char *name)
 {
 	nvlist_t **configs, **tops;
 	nvlist_t *config;
 	nvlist_t *best_cfg, *nvtop, *nvroot;
 	uint64_t *holes;
 	uint64_t best_txg;
 	uint64_t nchildren;
 	uint64_t pgid;
 	uint64_t count;
 	uint64_t i;
 	uint_t   nholes;
 
 	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
 		return (NULL);
 
 	ASSERT3U(count, !=, 0);
 
 	/*
 	 * Pick the label with the most recent txg.  Slots may be empty
 	 * (the loop that fills tops[] below already allows for that), so
 	 * they have to be skipped here as well, and best_cfg must not be
 	 * left uninitialized when every label reports a txg of zero.
 	 */
 	best_cfg = NULL;
 	best_txg = 0;
 	for (i = 0; i < count; i++) {
 		uint64_t txg;
 
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
 		    &txg) == 0);
 		if (best_cfg == NULL || txg > best_txg) {
 			best_txg = txg;
 			best_cfg = configs[i];
 		}
 	}
 
 	/* Without any usable label there is nothing to build from. */
 	if (best_cfg == NULL) {
 		kmem_free(configs, count * sizeof(void *));
 		return (NULL);
 	}
 
 	/* Both lookups are optional; the defaults apply when absent. */
 	nchildren = 1;
 	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
 	holes = NULL;
 	nholes = 0;
 	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
 	    &holes, &nholes);
 
 	/* Slot i of tops[] receives the vdev tree from configs[i]. */
 	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
 	for (i = 0; i < nchildren; i++) {
 		if (i >= count)
 			break;
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
 		    &nvtop) == 0);
 		/* A failed dup leaves the slot empty; it is filled below. */
 		(void) nvlist_dup(nvtop, &tops[i], KM_SLEEP);
 	}
 
 	/*
 	 * Fill the slots named in the hole array.  The bound that matters
 	 * is the slot number holes[i], which indexes tops[], not the
 	 * position i within the hole array.
 	 */
 	for (i = 0; holes != NULL && i < nholes; i++) {
 		if (holes[i] >= nchildren)
 			continue;
 		if (tops[holes[i]] != NULL)
 			continue;
 		VERIFY(nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_HOLE) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
 		    holes[i]) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/* Whatever is still empty is reported as a missing vdev. */
 	for (i = 0; i < nchildren; i++) {
 		if (tops[i] != NULL)
 			continue;
 		VERIFY(nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_MISSING) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
 		    i) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/*
 	 * Create pool config based on the best vdev config.
 	 */
 	VERIFY(nvlist_dup(best_cfg, &config, KM_SLEEP) == 0);
 
 	/*
 	 * Put this pool's top-level vdevs into a root vdev.
 	 */
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pgid) == 0);
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
 	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    tops, nchildren) == 0);
 
 	/*
 	 * Replace the existing vdev_tree with the new root vdev in
 	 * this pool's configuration (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 
 	/*
 	 * Drop vdev config elements that should not be present at pool level.
 	 */
 	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
 	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
 
 	/* Release the labels and the temporary top-level/root nvlists. */
 	for (i = 0; i < count; i++)
 		nvlist_free(configs[i]);
 	kmem_free(configs, count * sizeof(void *));
 	for (i = 0; i < nchildren; i++)
 		nvlist_free(tops[i]);
 	kmem_free(tops, nchildren * sizeof(void *));
 	nvlist_free(nvroot);
 	return (config);
 }
 
 int
 spa_import_rootpool(const char *name)
 {
 	spa_t *spa;
 	vdev_t *rvd, *bvd, *avd = NULL;
 	nvlist_t *config, *nvtop;
 	uint64_t txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(name);
 
 	mutex_enter(&spa_namespace_lock);
 	if (config != NULL) {
 		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &pname) == 0 && strcmp(name, pname) == 0);
 		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
 		    == 0);
 
 		if ((spa = spa_lookup(pname)) != NULL) {
 			/*
 			 * Remove the existing root pool from the namespace so
 			 * that we can replace it with the correct config
 			 * we just read in.
 			 */
 			spa_remove(spa);
 		}
 		spa = spa_add(pname, config, NULL);
 
 		/*
 		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
 		 * via spa_version().
 		 */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 		    &spa->spa_ubsync.ub_version) != 0)
 			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 	} else if ((spa = spa_lookup(name)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
 		    name);
 		return (EIO);
 	} else {
 		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
 	}
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    pname);
 		return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (0);
 }
 
 #endif	/* illumos */
 #endif	/* _KERNEL */
 
 /*
  * Import a non-root pool into the system.
  */
 int
 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_rewind_policy_t policy;
 	uint64_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	if (readonly)
 		mode = FREAD;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 		spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
 
 		mutex_exit(&spa_namespace_lock);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	zpool_get_rewind_policy(config, &policy);
 	if (policy.zrp_request & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	/*
 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 */
 	if (state != SPA_LOAD_RECOVER)
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
 	    policy.zrp_request);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_SPARE);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_L2CACHE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	spa_history_log_version(spa, "import");
 
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
 
 	mutex_exit(&spa_namespace_lock);
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	zvol_create_minors(pool);
 #endif
 #endif
 	return (0);
 }
 
 /*
  * Probe a candidate pool configuration without importing it.
  *
  * A temporary spa_t is registered under TRYIMPORT_NAME, activated with FREAD
  * and loaded with SPA_LOAD_TRYIMPORT.  It is always unloaded, deactivated and
  * removed again before returning; all of that happens with
  * spa_namespace_lock held.
  *
  *	tryconfig	config to probe; must contain ZPOOL_CONFIG_POOL_NAME
  *			and ZPOOL_CONFIG_POOL_STATE
  *
  * Returns the config generated from the loaded pool (augmented with the
  * caller's pool name and state, the uberblock timestamp, the load info,
  * optionally the bootfs name, and the spare / l2cache lists), or NULL when a
  * required entry is missing from 'tryconfig' or the load left no root vdev.
  */
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *result = NULL;
 	char *name;
 	uint64_t pool_state;
 	spa_t *spa;
 	int load_err;
 
 	/* Both the pool name and the pool state are mandatory. */
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME,
 	    &name) != 0 ||
 	    nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE,
 	    &pool_state) != 0)
 		return (NULL);
 
 	/*
 	 * Build the throw-away spa under the reserved try-import name and
 	 * open it read-only.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 
 	/*
 	 * spa_load() does the real work.  mosconfig is B_TRUE because, for
 	 * an import, the user-supplied config is the one to trust.
 	 */
 	load_err = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING,
 	    B_TRUE);
 
 	/*
 	 * Without a root vdev 'tryconfig' was not even parsable, so there
 	 * is nothing to report back.
 	 */
 	if (spa->spa_root_vdev == NULL)
 		goto done;
 
 	result = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 	VERIFY(nvlist_add_string(result, ZPOOL_CONFIG_POOL_NAME, name) == 0);
 	VERIFY(nvlist_add_uint64(result, ZPOOL_CONFIG_POOL_STATE,
 	    pool_state) == 0);
 	VERIFY(nvlist_add_uint64(result, ZPOOL_CONFIG_TIMESTAMP,
 	    spa->spa_uberblock.ub_timestamp) == 0);
 	VERIFY(nvlist_add_nvlist(result, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	/*
 	 * Export the bootfs property, when the pool has one, so that
 	 * external consumers can tell which pools are bootable.
 	 */
 	if ((load_err == 0 || load_err == EEXIST) && spa->spa_bootfs != 0) {
 		char *rawname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 		if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs,
 		    rawname) == 0) {
 			char *bootfs = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 			char *slash = strchr(rawname, '/');
 
 			/*
 			 * The pool was opened as TRYIMPORT_NAME, so swap the
 			 * leading pool component for the real pool name.
 			 * NOTE(review): with no '/' in the name the raw name
 			 * is copied unchanged -- confirm that is intended.
 			 */
 			if (slash != NULL) {
 				(void) snprintf(bootfs, MAXPATHLEN, "%s/%s",
 				    name, slash + 1);
 			} else {
 				(void) strlcpy(bootfs, rawname, MAXPATHLEN);
 			}
 			VERIFY(nvlist_add_string(result, ZPOOL_CONFIG_BOOTFS,
 			    bootfs) == 0);
 			kmem_free(bootfs, MAXPATHLEN);
 		}
 		kmem_free(rawname, MAXPATHLEN);
 	}
 
 	/* Append the hot spares and level 2 cache devices. */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	spa_add_spares(spa, result);
 	spa_add_l2cache(spa, result);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 done:
 	/* Tear the temporary spa down again, whatever the outcome. */
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (result);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  * This is the shared implementation behind spa_destroy(), spa_export() and
  * spa_reset().
  *
  *	pool		name of the pool to look up in the namespace
  *	new_state	POOL_STATE_DESTROYED, POOL_STATE_EXPORTED, or
  *			POOL_STATE_UNINITIALIZED (reset: unload only, the
  *			spa_t stays in the namespace)
  *	oldconfig	if non-NULL, set to NULL on entry and, when the pool
  *			has a config, to a duplicate of it
  *	force		allow exporting a pool with an active shared spare
  *	hardforce	skip the label state update and the config cache sync
  *
  * Returns 0 on success, or one of:
  *	EROFS	spa_mode_global does not include FWRITE
  *	ENOENT	no pool of that name exists
  *	EBUSY	the pool still has references (or injection references,
  *		unless this is a reset)
  *	EXDEV	exporting without 'force' a pool with an active shared spare
  */
 static int
 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	spa_t *spa;
 
 	/* Make sure the caller never sees a stale pointer on failure. */
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & FWRITE))
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 
 	/*
 	 * The pool will be in core if it's openable,
 	 * in which case we can modify its state.
 	 */
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
 		/*
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 		spa_evicting_os_wait(spa);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
 		 * references.  If we are resetting a pool, allow references by
 		 * fault injection handlers.
 		 */
 		if (!spa_refcount_zero(spa) ||
 		    (spa->spa_inject_ref != 0 &&
 		    new_state != POOL_STATE_UNINITIALIZED)) {
 			/* Undo the spa_async_suspend() above before bailing. */
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EBUSY));
 		}
 
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool.  If the user insists ('force'), such
 		 * a pool can be forcibly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EXDEV));
 		}
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 	/*
 	 * NOTE(review): ESC_ZFS_POOL_DESTROY is posted on every path that
 	 * reaches this point, i.e. for export and reset as well as destroy.
 	 */
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand back a private copy of the config if the caller asked for it. */
 	if (oldconfig && spa->spa_config)
 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
 
 	/*
 	 * Export and destroy drop the spa_t from the namespace (syncing the
 	 * config cache first unless 'hardforce' is set); a reset leaves the
 	 * spa_t registered.
 	 */
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Destroy the named storage pool.
  *
  * Runs the common export path with POOL_STATE_DESTROYED as the target state,
  * with neither 'force' nor 'hardforce' and without requesting the old config.
  * Returns whatever spa_export_common() returns.
  */
 int
 spa_destroy(char *pool)
 {
 	int err;
 
 	err = spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 	    B_FALSE, B_FALSE);
 	return (err);
 }
 
 /*
  * Export the named storage pool.
  *
  * Runs the common export path with POOL_STATE_EXPORTED as the target state,
  * passing 'oldconfig', 'force' and 'hardforce' through unchanged.
  * Returns whatever spa_export_common() returns.
  */
 int
 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int err;
 
 	err = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
 	    force, hardforce);
 	return (err);
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  */
 int
 spa_reset(char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * Add a device to a storage pool.
  *
  *	spa	pool to extend; must be writeable (asserted)
  *	nvroot	vdev tree describing the additions: top-level children and/or
  *		ZPOOL_CONFIG_SPARES / ZPOOL_CONFIG_L2CACHE arrays
  *
  * Returns 0 on success.  On failure returns the value produced by
  * spa_vdev_exit() for the failing step: the error from spa_config_parse(),
  * vdev_create() or spa_validate_aux(), or EINVAL when 'nvroot' contains no
  * children, no spares and no l2cache devices.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg, id;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* Parse 'nvroot' into a temporary vdev tree rooted at 'vd'. */
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	/* A missing array simply means "none of that kind to add". */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* Nothing at all to add is a caller error. */
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 
 		/*
 		 * Set the vdev id to the first hole, if one exists.
 		 * If the scan finds no hole, 'id' ends up equal to
 		 * rvd->vdev_children, i.e. the next unused slot.
 		 */
 		for (id = 0; id < rvd->vdev_children; id++) {
 			if (rvd->vdev_child[id]->vdev_ishole) {
 				vdev_free(rvd->vdev_child[id]);
 				break;
 			}
 		}
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = id;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	/* Install the new spares and flag the spare list for syncing. */
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/* Likewise for the new level 2 cache devices. */
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	/* The config update and the event are issued under the namespace lock. */
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a mirror.  The arguments are the path to any device
  * in the mirror, and the nvroot for the new device.  If the path specifies
  * a device that is not mirrored, we automatically insert the mirror vdev.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  *	spa		pool to modify; must be writeable (asserted)
  *	guid		guid of the existing leaf vdev to attach to / replace
  *	nvroot		vdev tree holding exactly one new leaf vdev
  *	replacing	non-zero for a replace, zero for a plain attach
  *
  * Returns 0 on success.  On failure returns the value produced by
  * spa_vdev_exit() for the failing check:
  *	ENODEV		no vdev with 'guid' exists
  *	ENOTSUP		the existing vdev is not a leaf, or the parent / spare /
  *			log combination does not allow the operation
  *	EINVAL		'nvroot' could not be parsed, or does not describe
  *			exactly one leaf vdev
  *	EOVERFLOW	the new device is too small
  *	EDOM		the new device needs a larger ashift than the top-level
  *	(other)		error from vdev_create()
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!oldvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = oldvd->vdev_parent;
 
 	/*
 	 * Note that the specific spa_config_parse() error is discarded;
 	 * any parse failure is reported as EINVAL.
 	 */
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* The new tree must consist of exactly one leaf under its root. */
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * Spares can't replace logs
 	 */
 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 	/*
 	 * Choose 'pvops', the ops vector the parent of oldvd/newvd must have
 	 * once the operation is done: mirror for an attach, spare or
 	 * replacing for a replace.
 	 */
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or the root
 		 * vdev.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 * The new path is "<newvd path>/old"; the "+ 5" covers the four
 	 * characters of "/old" plus the terminating NUL.
 	 */
 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/%s",
 		    newvd->vdev_path, "old");
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 	}
 
 	/* mark the device being resilvered */
 	newvd->vdev_resilver_txg = txg;
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (pvd->vdev_ops != pvops)
 		pvd = vdev_add_parent(oldvd, pvops);
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 	ASSERT(pvd->vdev_ops == pvops);
 	ASSERT(oldvd->vdev_parent == pvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 * It becomes the last child and inherits oldvd's creation txg.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
 	}
 
 	/*
 	 * Snapshot what the history record below needs now; it is written
 	 * after spa_vdev_exit() and so uses these copies instead of
 	 * dereferencing oldvd/newvd (presumably because the vdevs may change
 	 * once the config lock is dropped -- confirm).
 	 */
 	oldvdpath = spa_strdup(oldvd->vdev_path);
 	newvdpath = spa_strdup(newvd->vdev_path);
 	newvd_isspare = newvd->vdev_isspare;
 
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	/*
 	 * Schedule the resilver to restart in the future. We do this to
 	 * ensure that dmu_sync-ed blocks have been stitched into the
 	 * respective datasets.
 	 */
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
 
 	/*
 	 * Commit the config.  Note that dtl_max_txg, not txg, is passed to
 	 * spa_vdev_exit() here.
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	/* Logged as "spare in", "replace" or "attach" depending on the mode. */
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing vdev.
  *
  *	spa		pool to modify; must be writeable (asserted)
  *	guid		guid of the leaf vdev to detach
  *	pguid		expected guid of that vdev's parent, or 0 to skip the
  *			parent check
  *	replace_done	non-zero to require a replacing or spare parent
  *
  * Returns the result of the final spa_vdev_exit() on the success path.
  * On an early failure returns the value produced by spa_vdev_exit() for:
  *	ENODEV	no vdev with 'guid' exists
  *	ENOTSUP	the vdev is not a leaf, or its parent type does not
  *		support detach
  *	EBUSY	the parent guid does not match 'pguid', or the vdev holds
  *		the only valid copy of some data
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			/* A sibling whose path is exactly "<vd path>/old". */
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a spare, then it implies
 	 * that the spare should become a real disk, and be removed from the
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_id == 0 &&
 	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 *
 	 * The value stored in 'error' here is never examined; it is
 	 * overwritten by the spa_vdev_exit() result further down.
 	 */
 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 * For a spare parent the vdev_unspare flag on the child is cleared
 	 * first.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 *
 	 * The path is copied first because the history record is written
 	 * after spa_vdev_exit() has been handed vd.
 	 */
 	vdpath = spa_strdup(vd->vdev_path);
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	/* This result is what the function ultimately returns. */
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * Hold altspa across the window in which the
 			 * namespace lock is dropped for spa_vdev_remove().
 			 */
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_offline_log(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || vd->vdev_ishole) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    vml[c]->vdev_ishole ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c])) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
 	}
 
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    glist, children) == 0);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
 	    nvl) == 0);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    spa_version(spa)) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 #ifndef illumos
 	/* mark that we are creating new spa by splitting */
 	newspa->spa_splitting_newspa = B_TRUE;
 #endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
 #ifndef illumos
 	newspa->spa_splitting_newspa = B_FALSE;
 #endif
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 			vdev_free(vml[c]);
 		}
 	}
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Search the 'count' nvlists in nvpp[] for one whose ZPOOL_CONFIG_GUID
  * equals target_guid.  Returns the first matching nvlist, or NULL when
  * none matches.  Each nvlist examined must contain ZPOOL_CONFIG_GUID;
  * a failed lookup trips the VERIFY.
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	int i;
 
 	for (i = 0; i < count; i++) {
 		uint64_t guid;
 
 		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		/* Stop at the first hit; 'i' then indexes the match. */
 		if (guid == target_guid)
 			break;
 	}
 
 	return (i < count ? nvpp[i] : NULL);
 }
 
 /*
  * Replace the nvlist array stored under 'name' in 'config' with a copy
  * of dev[] that omits dev_to_remove.  The replacement array is added with
  * count - 1 entries, so dev_to_remove is expected to be one of the
  * 'count' entries of dev[].
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 	nvlist_t *dev_to_remove)
 {
 	nvlist_t **newdev = NULL;
 	int remaining = count - 1;
 	int i, j;
 
 	if (remaining > 0)
 		newdev = kmem_alloc(remaining * sizeof (void *), KM_SLEEP);
 
 	/* Duplicate the survivors; j tracks the next free output slot. */
 	for (i = 0, j = 0; i < count; i++) {
 		if (dev[i] != dev_to_remove) {
 			VERIFY(nvlist_dup(dev[i], &newdev[j++],
 			    KM_SLEEP) == 0);
 		}
 	}
 
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
 
 	/* Release our duplicates now that they have been added to config. */
 	for (i = 0; i < remaining; i++)
 		nvlist_free(newdev[i]);
 
 	if (remaining > 0)
 		kmem_free(newdev, remaining * sizeof (void *));
 }
 
 /*
  * Evacuate a top-level vdev ahead of its removal.
  *
  * Only log devices are handled: a log vdev that still has space
  * allocated is emptied through spa_offline_log(); any other vdev fails
  * with ENOTSUP.  On success the vdev is flagged as removing and its
  * leaves' DTLs and its config are dirtied.  Returns 0 or an errno value.
  *
  * The caller must hold spa_namespace_lock and must not hold the config
  * lock as writer.
  */
 static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
 	uint64_t txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	ASSERT(vd == vd->vdev_top);
 
 	/* Evacuating anything other than a log device is unsupported. */
 	if (!vd->vdev_islog)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * We don't hold the config lock as writer since we need to do
 	 * I/O, but spa_namespace_lock stays held.  Once this completes
 	 * the device should no longer have any blocks allocated on it.
 	 */
 	if (vd->vdev_stat.vs_alloc != 0) {
 		int error = spa_offline_log(spa);
 
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Evacuation succeeded.  Remove any remaining MOS metadata tied
 	 * to this vdev, and wait for these changes to sync.
 	 */
 	ASSERT0(vd->vdev_stat.vs_alloc);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
 	vdev_dirty_leaves(vd, VDD_DTL, txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	return (0);
 }
 
 /*
  * Finish a removal by taking the (empty) top-level vdev out of the root
  * vdev's child list.  Nothing is done if the vdev still has space
  * allocated.  The caller must hold spa_namespace_lock and all config
  * locks as writer.
  */
 static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t slot = vd->vdev_id;
 	boolean_t was_last;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
 	/* Leave vdevs that still hold allocated space alone. */
 	if (vd->vdev_stat.vs_alloc != 0)
 		return;
 
 	/* Note the position now; vd itself is freed below. */
 	was_last = (slot == (rvd->vdev_children - 1));
 
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* Drop vd from the dirty lists before it goes away. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	vdev_free(vd);
 
 	if (!was_last) {
 		/* Fill the vacated slot with a hole vdev of the same id. */
 		vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 
 		vdev_add_child(rvd, hole);
 	} else {
 		vdev_compact_children(rvd);
 	}
 	vdev_config_dirty(rvd);
 
 	/* Reassess the health of the root vdev. */
 	vdev_reopen(rvd);
 }
 
 /*
  * Remove a device from the pool -
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  *
  * Currently, this supports removing only hot spares, slogs, and level 2 ARC
  * devices.
  *
  * 'guid' is checked against the spare list first, then the l2cache list,
  * and finally against the pool's vdev tree (where only log devices can be
  * removed).  'unspare' permits removing a hot spare even when a vdev with
  * that guid was also found in the pool.
  *
  * Returns 0 on success, EBUSY for a spare that is in use, ENOTSUP for a
  * non-log pool vdev, ENOENT when nothing matches the guid, or the error
  * from spa_vdev_remove_evacuate().  When the caller already holds
  * spa_namespace_lock, spa_vdev_enter()/spa_vdev_exit() are not called here
  * and the error is returned directly.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0;
 	/* Remember whether the caller entered with the namespace lock held. */
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
 	ASSERT(spa_writeable(spa));
 
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	/* vd stays NULL when the guid is not part of the pool's vdev tree. */
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		/*
 		 * Cache devices can always be removed.
 		 */
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		/* Log removal takes and drops the locks itself below. */
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
 		mg = vd->vdev_mg;
 
 		/*
 		 * Stop allocating from this vdev.
 		 */
 		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		/*
 		 * Attempt to evacuate the vdev.
 		 */
 		error = spa_vdev_remove_evacuate(spa, vd);
 
 		/* Re-take the config lock for the unwind or namespace cleanup. */
 		txg = spa_vdev_config_enter(spa);
 
 		/*
 		 * If we couldn't evacuate the vdev, unwind.
 		 */
 		if (error) {
 			metaslab_group_activate(mg);
 			return (spa_vdev_exit(spa, NULL, txg, error));
 		}
 
 		/*
 		 * Clean up the vdev namespace.
 		 */
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
 		 * Normal vdevs cannot be removed (yet).
 		 */
 		error = SET_ERROR(ENOTSUP);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = SET_ERROR(ENOENT);
 	}
 
 	/* Only undo spa_vdev_enter() if this function performed it. */
 	if (!locked)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	return (error);
 }
 
 /*
  * Find any device that's done replacing, or a vdev marked 'unspare' that's
  * currently spared, so we can detach it.
  *
  * The subtree rooted at 'vd' is searched children-first; the first
  * candidate found is returned.  Returns NULL when the subtree holds
  * nothing that can be detached.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 
 	/* Descend first so that candidates deeper in the tree win. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
 	 * the newest (see spa_vdev_attach() for how that works).  In
 	 * the case where the newest vdev is faulted, we will not automatically
 	 * remove it after a resilver completes.  This is OK as it will require
 	 * user intervention to determine which disk the admin wishes to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
 
 		/*
 		 * The old device may go once the new one has empty MISSING
 		 * and OUTAGE DTLs and the old one is no longer required.
 		 */
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 */
 	if (vd->vdev_ops == &vdev_spare_ops) {
 		vdev_t *first = vd->vdev_child[0];
 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 
 		/*
 		 * Whichever end carries vdev_unspare is treated as the
 		 * device to keep (newvd); the opposite end is the detach
 		 * candidate (oldvd).
 		 */
 		if (last->vdev_unspare) {
 			oldvd = first;
 			newvd = last;
 		} else if (first->vdev_unspare) {
 			oldvd = last;
 			newvd = first;
 		} else {
 			oldvd = NULL;
 		}
 
 		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 
 		/*
 		 * If there are more than two spares attached to a disk,
 		 * and those spares are not required, then we want to
 		 * attempt to free them up now so that they can be used
 		 * by other pools.  Once we're back down to a single
 		 * disk+spare, we stop removing them.
 		 */
 		if (vd->vdev_children > 2) {
 			newvd = vd->vdev_child[1];
 
 			if (newvd->vdev_isspare && last->vdev_isspare &&
 			    vdev_dtl_empty(last, DTL_MISSING) &&
 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
 			    !vdev_dtl_required(newvd))
 				return (newvd);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Detach every vdev that spa_vdev_resilver_done_hunt() reports as
  * finished.  The search runs with all config locks held as writer; the
  * locks are dropped around each spa_vdev_detach() call and re-taken before
  * the next search.  If a detach fails the function returns immediately
  * (the config locks have already been released at that point).
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd, *pvd, *ppvd;
 	uint64_t guid, sguid, pguid, ppguid;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		/*
 		 * Capture the guids now: the vdev pointers are not used once
 		 * the config lock is dropped below.
 		 */
 		pvd = vd->vdev_parent;
 		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
 		pguid = pvd->vdev_guid;
 		ppguid = ppvd->vdev_guid;
 		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		/* sguid is non-zero only when a second detach was selected above. */
 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Update the stored path (ispath == B_TRUE) or FRU (ispath == B_FALSE)
  * string of the leaf vdev identified by guid.
  *
  * Fails with ENOENT if no vdev has that guid and ENOTSUP if the vdev is
  * not a leaf.  Otherwise returns the result of spa_vdev_state_exit(),
  * which is handed the vdev only when the stored string actually changed.
  */
 int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	boolean_t changed = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (!ispath) {
 		/* The FRU may be unset; only free a previous value. */
 		if (vd->vdev_fru == NULL ||
 		    strcmp(value, vd->vdev_fru) != 0) {
 			if (vd->vdev_fru != NULL)
 				spa_strfree(vd->vdev_fru);
 			vd->vdev_fru = spa_strdup(value);
 			changed = B_TRUE;
 		}
 	} else if (strcmp(value, vd->vdev_path) != 0) {
 		spa_strfree(vd->vdev_path);
 		vd->vdev_path = spa_strdup(value);
 		changed = B_TRUE;
 	}
 
 	return (spa_vdev_state_exit(spa, changed ? vd : NULL, 0));
 }
 
 /*
  * Set the stored device path of the leaf vdev identified by guid.
  * Thin wrapper around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	int error;
 
 	error = spa_vdev_set_common(spa, guid, newpath, B_TRUE);
 	return (error);
 }
 
 /*
  * Set the stored FRU string of the leaf vdev identified by guid.
  * Thin wrapper around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	int error;
 
 	error = spa_vdev_set_common(spa, guid, newfru, B_FALSE);
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 
 /*
  * Cancel the pool's current scan.  A resilver cannot be cancelled this
  * way: EBUSY is returned while one is in progress.  Otherwise returns the
  * result of dsl_scan_cancel().  Must be called without the config lock
  * held as writer.
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		error = dsl_scan_cancel(spa->spa_dsl_pool);
 	else
 		error = SET_ERROR(EBUSY);
 
 	return (error);
 }
 
 /*
  * Start a scan of type 'func' on the pool.
  *
  * Returns ENOTSUP for POOL_SCAN_NONE or an out-of-range function.  A
  * resilver request with nothing to resilver queues
  * SPA_ASYNC_RESILVER_DONE and returns 0; every other request returns the
  * result of dsl_scan().  Must be called without the config lock held as
  * writer.
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func == POOL_SCAN_NONE || func >= POOL_SCAN_FUNCS)
 		return (SET_ERROR(ENOTSUP));
 
 	/* Anything but a no-op resilver goes straight to the DSL. */
 	if (func != POOL_SCAN_RESILVER ||
 	    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 		return (dsl_scan(spa->spa_dsl_pool, func));
 
 	/*
 	 * A resilver was requested, but there is no DTL on a writeable
 	 * leaf device, so there is nothing to do.
 	 */
 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	return (0);
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Walk the vdev tree rooted at vd and, for every vdev flagged with
  * vdev_remove_wanted, move it to the REMOVED state, reset its error
  * counters and post a removal event.
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	int c;
 
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * Clear only the error counters.  A full vdev_clear() would
 		 * also discard degraded/faulted state and try to reopen
 		 * the device, all of which is a waste here.
 		 */
 		vd->vdev_stat.vs_read_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 
 		/* Let userspace know the vdev has gone away. */
 		zfs_post_remove(spa, vd);
 	}
 
 	/* Apply the same treatment to every child. */
 	c = 0;
 	while (c < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Walk the vdev tree rooted at vd and reopen every vdev that has
  * vdev_probe_wanted set, clearing the flag first.
  */
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	int c;
 
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		/* The actual probe happens in vdev_open(). */
 		vdev_reopen(vd);
 	}
 
 	c = 0;
 	while (c < vd->vdev_children) {
 		spa_async_probe(spa, vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * If the pool has autoexpand enabled, log an ESC_ZFS_VDEV_AUTOEXPAND
  * sysevent for every leaf vdev under vd that has a physical path.  The
  * event carries the path, prefixed with "/devices", as DEV_PHYS_PATH.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	sysevent_id_t eid;
 	nvlist_t *attr;
 	char *physpath;
 	int c;
 
 	if (!spa->spa_autoexpand)
 		return;
 
 	/* Visit the whole subtree before handling vd itself. */
 	for (c = 0; c < vd->vdev_children; c++)
 		spa_async_autoexpand(spa, vd->vdev_child[c]);
 
 	/* Only leaf vdevs with a known physical path generate an event. */
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		(void) snprintf(physpath, MAXPATHLEN, "/devices%s",
 		    vd->vdev_physpath);
 
 		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH,
 		    physpath) == 0);
 
 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 		    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
 
 		nvlist_free(attr);
 		kmem_free(physpath, MAXPATHLEN);
 	}
 }
 
 /*
  * Body of the pool's general async worker thread ('arg' is the spa_t).
  *
  * Takes a snapshot of the pending task mask and then handles, in order:
  * config update, autoexpand, probe, resilver-done and resilver.  When
  * finished it clears spa_async_thread, wakes anyone waiting on
  * spa_async_cv and exits.  SPA_ASYNC_REMOVE is deliberately not handled
  * here; see spa_async_thread_vd().
  */
 static void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/*
 	 * Claim the pending tasks.  Every bit except SPA_ASYNC_REMOVE is
 	 * cleared from the shared mask; REMOVE stays pending.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		/* Sample the normal class size on both sides of the update. */
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
 	/* Autoexpand is skipped while the pool is suspended. */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE)
 		spa_vdev_resilver_done(spa);
 
 	/*
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
 		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Body of the async worker thread dedicated to SPA_ASYNC_REMOVE ('arg' is
  * the spa_t).
  *
  * Marks removed devices in the root vdev tree, the l2cache devices and the
  * spares, and repeats for as long as new REMOVE requests keep arriving.
  * On exit it clears spa_async_thread_vd and wakes anyone waiting on
  * spa_async_cv.
  */
 static void
 spa_async_thread_vd(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 retry:
 	/*
 	 * spa_async_lock is held here on both the initial entry and the
 	 * 'goto retry' path.  Only the REMOVE bit is consumed.
 	 */
 	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	/* Go around again if another REMOVE was requested in the meantime. */
 	tasks = spa->spa_async_tasks;
 	if ((tasks & SPA_ASYNC_REMOVE) != 0)
 		goto retry;
 	spa->spa_async_thread_vd = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Suspend async task processing for this pool.
  *
  * Raises the suspend count, which stops spa_async_dispatch() and
  * spa_async_dispatch_vd() from starting new threads, and then blocks
  * until neither the general async thread nor the vdev-removal async
  * thread is running.  Each thread clears its own pointer and broadcasts
  * spa_async_cv as it exits.  Pair every call with spa_async_resume().
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	/*
 	 * Wait while EITHER thread is still alive.  Waiting only while
 	 * both are alive would return as soon as one of them exited (or
 	 * immediately, if only one was running), leaving the other active
 	 * after the caller believes async work has stopped.
 	 */
 	while (spa->spa_async_thread != NULL ||
 	    spa->spa_async_thread_vd != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Undo one spa_async_suspend() by lowering the suspend count.  The count
  * must be non-zero on entry.
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended -= 1;
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Report whether the general async thread has work to do.
  *
  * SPA_ASYNC_REMOVE is ignored here.  Any other non-config task counts as
  * pending.  A config update counts only when no config-cache write
  * failure is recorded (spa_ccw_fail_time == 0) or when at least
  * zfs_ccw_retry_interval seconds have passed since that failure.
  */
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t pending = spa->spa_async_tasks;
 
 	/* Anything besides CONFIG_UPDATE and REMOVE is always runnable. */
 	if ((pending & ~(SPA_ASYNC_CONFIG_UPDATE | SPA_ASYNC_REMOVE)) != 0)
 		return (B_TRUE);
 
 	if ((pending & SPA_ASYNC_CONFIG_UPDATE) == 0)
 		return (B_FALSE);
 
 	/* No recorded failure: the config update may run right away. */
 	if (spa->spa_ccw_fail_time == 0)
 		return (B_TRUE);
 
 	/* Otherwise hold it back until the retry interval has elapsed. */
 	return ((gethrtime() - spa->spa_ccw_fail_time) >=
 	    (zfs_ccw_retry_interval * NANOSEC));
 }
 
 /*
  * Start the general async thread if there is work pending, async
  * processing is not suspended, no such thread is already running and
  * rootdir is set (presumably meaning the root filesystem is available;
  * confirm against the platform code).
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended) {
 		if (spa->spa_async_thread == NULL && rootdir != NULL) {
 			spa->spa_async_thread = thread_create(NULL, 0,
 			    spa_async_thread, spa, 0, &p0, TS_RUN,
 			    maxclsyspri);
 		}
 	}
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Start the vdev-removal async thread if a SPA_ASYNC_REMOVE request is
  * pending, async processing is not suspended, no such thread is already
  * running and rootdir is set (presumably meaning the root filesystem is
  * available; confirm against the platform code).
  */
 static void
 spa_async_dispatch_vd(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
 	    !spa->spa_async_suspended) {
 		if (spa->spa_async_thread_vd == NULL && rootdir != NULL) {
 			spa->spa_async_thread_vd = thread_create(NULL, 0,
 			    spa_async_thread_vd, spa, 0, &p0, TS_RUN,
 			    maxclsyspri);
 		}
 	}
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record an async task request (a mask of SPA_ASYNC_* bits) for the pool
  * and give the vdev-removal thread a chance to start.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	/* Merge the request into the pending mask under the async lock. */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks = spa->spa_async_tasks | task;
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 /*
  * Block-pointer iteration callback: append bp to the bpobj passed as
  * 'arg'.  Always returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	bpobj_enqueue(arg, bp, tx);
 
 	return (0);
 }
 
 /*
  * Block-pointer iteration callback: issue an asynchronous free of bp as
  * a child of the parent zio passed as 'arg', in the txg of tx.  Always
  * returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *pio = arg;
 	zio_t *fio;
 
 	fio = zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
 	    BP_GET_PSIZE(bp), pio->io_flags);
 	zio_nowait(fio);
 
 	return (0);
 }
 
 /*
  * Free every block on bpl and wait for the frees to complete.
  *
  * Note: this simple function is kept out of line so that the time spent
  * syncing frees is easy to measure with dtrace.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	/* All frees are issued as children of one root zio. */
 	zio = zio_root(spa, NULL, NULL, 0);
 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 
 	VERIFY(zio_wait(zio) == 0);
 }
 
 /*
  * Free every block recorded in the pool's deferred-free bpobj and wait
  * for the frees to complete.
  *
  * Note: this simple function is kept out of line so that the time spent
  * syncing deferred frees is easy to measure with dtrace.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	/* All frees are issued as children of one root zio. */
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 	    spa_free_sync_cb, zio, tx), ==, 0);
 	VERIFY0(zio_wait(zio));
 }
 
 
 /*
  * Write nv, XDR-encoded, to MOS object 'obj' starting at offset 0, and
  * store the packed size in the object's bonus buffer.  The data written
  * is zero-padded up to a multiple of SPA_CONFIG_BLOCKSIZE.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	char *packed = NULL;
 	size_t padded_len;
 	size_t nvsize = 0;
 	dmu_buf_t *db;
 	uint64_t *bonus_sizep;
 
 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
 	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * spares us a pre-read of data we don't actually care about.
 	 */
 	padded_len = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = kmem_alloc(padded_len, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
 	/* Zero the tail between the packed data and the block boundary. */
 	bzero(packed + nvsize, padded_len - nvsize);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, padded_len, packed, tx);
 
 	kmem_free(packed, padded_len);
 
 	/* Record the true (unpadded) size in the bonus buffer. */
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	bonus_sizep = db->db_data;
 	*bonus_sizep = nvsize;
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Sync the list of auxiliary devices described by 'sav' to the MOS, if
  * it has been flagged with sav_sync.
  *
  * 'config' is the nvlist key the device array is stored under and 'entry'
  * is the pool-directory ZAP name that points at the backing object.  The
  * backing object is created on first use.  sav_sync is cleared when done.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 	nvlist_t **list;
 	int i;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Update the MOS nvlist describing the list of available devices.
 	 * spa_validate_aux() will have already made sure this nvlist is
 	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	if (sav->sav_count != 0) {
 		size_t list_size = sav->sav_count * sizeof (void *);
 
 		/* Generate one config nvlist per aux vdev. */
 		list = kmem_alloc(list_size, KM_SLEEP);
 		i = 0;
 		while (i < sav->sav_count) {
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 			i++;
 		}
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
 		    sav->sav_count) == 0);
 
 		/* The temporaries are no longer needed once added. */
 		i = 0;
 		while (i < sav->sav_count) {
 			nvlist_free(list[i]);
 			i++;
 		}
 		kmem_free(list, list_size);
 	} else {
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * If any vdev config is dirty, generate a fresh pool config for the txg
  * of tx, remember it in spa_config_syncing (freeing any previous one) and
  * write it to the pool's config object.
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *newcfg;
 
 	if (list_is_empty(&spa->spa_config_dirty_list))
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	newcfg = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * When the spa version is being upgraded, make sure the config
 	 * object is written with the new version.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) {
 		fnvlist_add_uint64(newcfg, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	/* Replace whatever config was previously being synced. */
 	if (spa->spa_config_syncing)
 		nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = newcfg;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, newcfg, tx);
 }
 
 /*
  * Sync task that sets the pool version.  'arg' points at the uint64_t
  * version to install; it must be supported and not lower than the
  * current version.  The root vdev config is dirtied and the change is
  * logged in the pool history.
  */
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	uint64_t version = *(uint64_t *)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	/*
 	 * Setting the version is special cased when first creating the pool.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
 }
 
 /*
  * Set zpool properties.
  *
  * Sync task: 'arg' is an nvlist of property name/value pairs.  Each pair
  * is applied in turn while spa_props_lock is held.  Feature properties
  * enable the named feature; version, altroot, readonly and cachefile are
  * not written here; comment is kept in the spa_t; everything else is
  * stored in the pool-props ZAP object (created on first use) and, for a
  * few properties, mirrored into the in-core spa_t.
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
 		case ZPROP_INVAL:
 			/*
 			 * We checked this earlier in spa_prop_validate().
 			 */
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 
 			/* The feature name is the text following the '@'. */
 			fname = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(fname, &fid));
 
 			spa_feature_enable(spa, fid, tx);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=enabled", nvpair_name(elem));
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  It's unnecessary
 			 * to do this for pool creation since the vdev's
 			 * configuration has already been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL)
 				vdev_config_dirty(spa->spa_root_vdev);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			propname = zpool_prop_to_name(prop);
 			proptype = zpool_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				/* Stored as bytes, including the terminator. */
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", nvpair_name(elem), strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				/* Index values must map to a known name. */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", nvpair_name(elem), intval);
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 
 			/*
 			 * Mirror selected properties into the in-core spa.
 			 * NOTE(review): intval is assigned only in the
 			 * UINT64 branch above; the properties listed here
 			 * are presumably always uint64-typed -- confirm.
 			 */
 			switch (prop) {
 			case ZPOOL_PROP_DELEGATION:
 				spa->spa_delegation = intval;
 				break;
 			case ZPOOL_PROP_BOOTFS:
 				spa->spa_bootfs = intval;
 				break;
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
 				if (tx->tx_txg != TXG_INITIAL)
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOEXPAND);
 				break;
 			case ZPOOL_PROP_DEDUPDITTO:
 				spa->spa_dedup_ditto = intval;
 				break;
 			default:
 				break;
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Perform one-time upgrade on-disk changes.  spa_version() does not
  * reflect the new version this txg, so there must be no changes this
  * txg to anything that the upgrade code depends on after it executes.
  * Therefore this must be called after dsl_pool_sync() does the sync
  * tasks.
  *
  * Each step compares the last synced uberblock version (spa_ubsync)
  * with the in-core uberblock version (spa_uberblock) and runs only when
  * the pool is crossing that version threshold in this txg.  The whole
  * function runs with dp_config_rwlock held as writer and is expected to
  * be called in sync pass 1.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	ASSERT(spa->spa_sync_pass == 1);
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* Keeping the origin open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
 	 * when possibility to use lz4 compression for metadata was added
 	 * Old pools that have this feature enabled must be upgraded to have
 	 * this feature active
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		boolean_t lz4_en = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t lz4_ac = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		/* Enabled but not yet active: bump the feature refcount. */
 		if (lz4_en && !lz4_ac)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 
 	/*
 	 * If we haven't written the salt, do so now.  Note that the
 	 * feature may not be activated yet, but that's fine since
 	 * the presence of this ZAP entry is backwards compatible.
 	 */
 	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
 		VERIFY0(zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes),
 		    spa->spa_cksum_salt.zcs_bytes, tx));
 	}
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  *	spa	pool to sync; must be writeable (VERIFYed below)
  *	txg	transaction group being synced
  *
  * Outline:
  *  1. With SCL_CONFIG held as reader, convert pending vdev state changes
  *     into config changes.
  *  2. Run sync passes until the MOS is no longer dirty for this txg.
  *  3. Write the vdev configuration via vdev_config_sync(), retrying
  *     after zio_suspend()/zio_resume_wait() whenever it fails.
  *  4. Clean the dirty config list, publish the synced config, record
  *     the synced uberblock in spa_ubsync, update space statistics and
  *     dispatch any requested async tasks.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while (list_head(&spa->spa_state_dirty_list) != NULL) {
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		/*
 		 * Drop back to reader.  The outer loop re-checks the list
 		 * because the locks are released in between.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Record when this sync started and (re)arm the deadman timer so
 	 * it fires spa_deadman_synctime after the start of this sync.
 	 */
 	spa->spa_sync_starttime = gethrtime();
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
 	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
 #else	/* !illumos */
 #ifdef _KERNEL
 	callout_reset(&spa->spa_deadman_cycid,
 	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
 #endif
 #endif	/* illumos */
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		int i;
 
 		/* Stop at the first child with a non-default deflate ratio. */
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		/* The loop ran to completion: every child qualified. */
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY(0 == zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	/*
 	 * Iterate to convergence.
 	 */
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		/*
 		 * In early passes process this txg's frees directly; from
 		 * pass zfs_sync_pass_deferred_free onward queue them on the
 		 * deferred bpobj instead.
 		 */
 		if (pass < zfs_sync_pass_deferred_free) {
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			/*
 			 * We can not defer frees in pass 1, because
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 
 		/* Sync every vdev queued on the per-txg vdev list. */
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
 		if (pass == 1) {
 			spa_sync_upgrades(spa, tx);
 			ASSERT3U(txg, >=,
 			    spa->spa_uberblock.ub_rootbp.blk_birth);
 			/*
 			 * Note: We need to check if the MOS is dirty
 			 * because we could have marked the MOS dirty
 			 * without updating the uberblock (e.g. if we
 			 * have sync tasks but no dirty user data).  We
 			 * need to check the uberblock's rootbp because
 			 * it is updated if we have synced out dirty
 			 * data (though in this case the MOS will most
 			 * likely also be dirty due to second order
 			 * effects, we don't want to rely on that here).
 			 */
 			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
 			    !dmu_objset_is_dirty(mos, txg)) {
 				/*
 				 * Nothing changed on the first pass,
 				 * therefore this TXG is a no-op.  Avoid
 				 * syncing deferred frees, so that we
 				 * can keep this TXG as a no-op.
 				 */
 				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
 				    txg));
 				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
 				break;
 			}
 			spa_sync_deferred_frees(spa, tx);
 		}
 
 	} while (dmu_objset_is_dirty(mos, txg));
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
 	 *
 	 * If there are no dirty vdevs, we sync the uberblock to a few
 	 * random top-level vdevs that are known to be visible in the
 	 * config cache (see spa_vdev_add() for a complete description).
 	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
 	 */
 	for (;;) {
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			vdev_t *svd[SPA_DVAS_PER_BP];
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = spa_get_random(children);
 
 			/*
 			 * Starting at a random child, collect up to
 			 * SPA_DVAS_PER_BP top-level vdevs, skipping those
 			 * with no metaslab array and log devices.
 			 */
 			for (int c = 0; c < children; c++) {
 				vd = rvd->vdev_child[(c0 + c) % children];
 				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
 					continue;
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
 			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg);
 		}
 
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/* The config write failed: suspend, wait for resume, retry. */
 		zio_suspend(spa, NULL);
 		zio_resume_wait(spa);
 	}
 	dmu_tx_commit(tx);
 
 	/* The sync is committed; disarm the deadman timer. */
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
 #else	/* !illumos */
 #ifdef _KERNEL
 	callout_drain(&spa->spa_deadman_cycid);
 #endif
 #endif	/* illumos */
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	/* Remember the uberblock that was just synced. */
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	/* No sync is in progress any more. */
 	spa->spa_sync_pass = 0;
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * Wait for every active, writeable, non-suspended pool to sync.  The
  * namespace lock is dropped while waiting on each pool; a reference on
  * the spa_t is held across that window.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *pool;
 
 	mutex_enter(&spa_namespace_lock);
 	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
 		/* Only pools that can actually sync are waited on. */
 		if (spa_state(pool) == POOL_STATE_ACTIVE &&
 		    spa_writeable(pool) && !spa_suspended(pool)) {
 			spa_open_ref(pool, FTAG);
 			mutex_exit(&spa_namespace_lock);
 
 			txg_wait_synced(spa_get_dsl(pool), 0);
 
 			mutex_enter(&spa_namespace_lock);
 			spa_close(pool, FTAG);
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove all pools in the system.  For each pool: suspend its async
  * tasks, unload and deactivate it unless it is still uninitialized, and
  * finally hand it to spa_remove().
  */
 void
 spa_evict_all(void)
 {
 	spa_t *victim;
 
 	/*
 	 * Remove all cached state.  All pools should be closed now,
 	 * so every spa in the AVL tree should be unreferenced.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (victim = spa_next(NULL); victim != NULL;
 	    victim = spa_next(NULL)) {
 		/*
 		 * Stop async tasks.  The async thread may need to detach
 		 * a device that's been replaced, which requires grabbing
 		 * spa_namespace_lock, so we must drop it here.  A reference
 		 * is held on the spa while the lock is dropped.
 		 */
 		spa_open_ref(victim, FTAG);
 		mutex_exit(&spa_namespace_lock);
 
 		spa_async_suspend(victim);
 
 		mutex_enter(&spa_namespace_lock);
 		spa_close(victim, FTAG);
 
 		if (victim->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(victim);
 			spa_deactivate(victim);
 		}
 		spa_remove(victim);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Find a vdev by guid.  The root vdev tree is searched first; when 'aux'
  * is set, the l2cache devices and then the spares are searched as well.
  * Returns NULL if no vdev matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *match;
 	int idx;
 
 	match = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 	if (match != NULL || !aux)
 		return (match);
 
 	/* Auxiliary devices: l2cache first ... */
 	idx = 0;
 	while (idx < spa->spa_l2cache.sav_count) {
 		match = spa->spa_l2cache.sav_vdevs[idx];
 		if (match->vdev_guid == guid)
 			return (match);
 		idx++;
 	}
 
 	/* ... then spares. */
 	idx = 0;
 	while (idx < spa->spa_spares.sav_count) {
 		match = spa->spa_spares.sav_vdevs[idx];
 		if (match->vdev_guid == guid)
 			return (match);
 		idx++;
 	}
 
 	return (NULL);
 }
 
 /*
  * Set the pool's in-core uberblock version to 'version', mark the root
  * vdev's config dirty, and wait for the change to be synced out.  The
  * pool must be writeable and 'version' must not be lower than the
  * current version (both asserted).
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	dsl_pool_t *pool;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	/* Record the new version and schedule the config for rewriting. */
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Block until the currently open txg has synced. */
 	pool = spa_get_dsl(spa);
 	txg_wait_synced(pool, 0);
 }
 
 /*
  * Return B_TRUE if 'guid' names one of the pool's spares, either one
  * already present in sav_vdevs or one still described by an nvlist in
  * sav_pending; B_FALSE otherwise.
  */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t pending_guid;
 	int n;
 
 	/* Spares that already have a vdev. */
 	for (n = 0; n < spares->sav_count; n++) {
 		if (spares->sav_vdevs[n]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	/* Pending spares: compare against the guid stored in each nvlist. */
 	for (n = 0; n < spares->sav_npending; n++) {
 		if (nvlist_lookup_uint64(spares->sav_pending[n],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Check if a pool has an active shared spare device.
  * Note: reference count of an active spare is 2, as a spare and as a replace
  *
  * A spare counts only if spa_spare_exists() reports it, its owning pool
  * guid is non-zero and equal to this pool's guid, and its reference
  * count is greater than 2.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t owner;
 	int refs;
 	int n;
 
 	for (n = 0; n < spares->sav_count; n++) {
 		if (!spa_spare_exists(spares->sav_vdevs[n]->vdev_guid,
 		    &owner, &refs))
 			continue;
 		if (owner == 0ULL || owner != spa_guid(spa))
 			continue;
 		if (refs > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Post a sysevent corresponding to the given event.  The 'name' must be one of
  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  *
  *	spa	pool the event refers to; supplies the pool name and guid
  *	vd	optional vdev; when non-NULL its guid (and path, if set)
  *		are added to the payload
  *	name	event subclass name passed to sysevent_alloc()
  *
  * If any attribute cannot be added or attached, the event is dropped
  * without being logged.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
 {
 #ifdef _KERNEL
 	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
 	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
 
 	/* Pool name. */
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
 		goto done;
 
 	/* Pool guid. */
 	value.value_type = SE_DATA_TYPE_UINT64;
 	value.value.sv_uint64 = spa_guid(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
 		goto done;
 
 	if (vd) {
 		/* Vdev guid. */
 		value.value_type = SE_DATA_TYPE_UINT64;
 		value.value.sv_uint64 = vd->vdev_guid;
 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
 		    SE_SLEEP) != 0)
 			goto done;
 
 		/* Vdev path, only when one is set. */
 		if (vd->vdev_path) {
 			value.value_type = SE_DATA_TYPE_STRING;
 			value.value.sv_string = vd->vdev_path;
 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
 			    &value, SE_SLEEP) != 0)
 				goto done;
 		}
 	}
 
 	if (sysevent_attach_attributes(ev, attr) != 0)
 		goto done;
 	/*
 	 * Clear 'attr' so the cleanup below does not free the list
 	 * separately; presumably it belongs to 'ev' after a successful
 	 * attach -- TODO confirm against the sysevent API.
 	 */
 	attr = NULL;
 
 	(void) log_sysevent(ev, SE_SLEEP, &eid);
 
 done:
 	/* Free an unattached attribute list (if any), then the event. */
 	if (attr)
 		sysevent_free_attr(attr);
 	sysevent_free(ev);
 #endif
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	(revision 296519)
@@ -1,545 +1,546 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/utsname.h>
 #include <sys/sunddi.h>
 #include <sys/cred.h>
 #include "zfs_comutil.h"
 #ifdef _KERNEL
 #include <sys/cmn_err.h>
 #include <sys/zone.h>
 #endif
 
 /*
  * Routines to manage the on-disk history log.
  *
  * The history log is stored as a dmu object containing
  * <packed record length, record nvlist> tuples.
  *
  * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
  * "packed record length" is the packed length of the "record nvlist" stored
  * as a little endian uint64_t.
  *
  * The log is implemented as a ring buffer, though the original creation
  * of the pool ('zpool create') is never overwritten.
  *
  * The history log is tracked as object 'spa_t::spa_history'.  The bonus buffer
  * of 'spa_history' stores the offsets for logging/retrieving history as
  * 'spa_history_phys_t'.  'sh_pool_create_len' is the ending offset in bytes of
  * where the 'zpool create' record is stored.  This allows us to never
  * overwrite the original creation of the pool.  'sh_phys_max_off' is the
  * physical ending offset in bytes of the log.  This tells you the length of
  * the buffer. 'sh_eof' is the logical EOF (in bytes).  Whenever a record
  * is added, 'sh_eof' is incremented by the size of the record.
  * 'sh_eof' is never decremented.  'sh_bof' is the logical BOF (in bytes).
  * This is where the consumer should start reading from after reading in
  * the 'zpool create' portion of the log.
  *
  * 'sh_records_lost' keeps track of how many records have been overwritten
  * and permanently lost.
  */
 
 /*
  * Convert a logical log offset to a physical offset within the history
  * object.  Offsets wrap within the region that starts at
  * 'sh_pool_create_len' and ends at 'sh_phys_max_off'.
  */
 static uint64_t
 spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
 {
 	uint64_t ring_start = shpp->sh_pool_create_len;
 	uint64_t ring_len = shpp->sh_phys_max_off - ring_start;
 
 	return (ring_start + (log_off - ring_start) % ring_len);
 }
 
 /*
  * Allocate the pool's history object, record its object number in the
  * pool directory, and initialize 'sh_phys_max_off' in its bonus buffer.
  * The pool must not already have a history object (asserted).
  */
 void
 spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	spa_history_phys_t *offsets;
 	dmu_buf_t *dbp;
 	uint64_t log_size;
 
 	ASSERT(spa->spa_history == 0);
 	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
 	    sizeof (spa_history_phys_t), tx);
 
 	/* Make the new object findable through the pool directory. */
 	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
 	    &spa->spa_history, tx) == 0);
 
 	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
 	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
 
 	offsets = dbp->db_data;
 	dmu_buf_will_dirty(dbp, tx);
 
 	/*
 	 * Figure out maximum size of history log.  We set it at
 	 * 0.1% of pool size, with a max of 1G and min of 128KB.
 	 */
 	log_size = metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
 	if (log_size > (1<<30))
 		log_size = 1<<30;
 	if (log_size < (128<<10))
 		log_size = 128<<10;
 	offsets->sh_phys_max_off = log_size;
 
 	dmu_buf_rele(dbp, FTAG);
 }
 
 /*
  * Change 'sh_bof' to the beginning of the next record.
  *
  * Reads the length field of the record at the current BOF (which may
  * wrap around the physical end of the log), advances 'sh_bof' past that
  * record and counts it in 'sh_records_lost'.  Returns 0, or the error
  * from dmu_read().
  */
 static int
 spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t reclen;
 	char lenbuf[sizeof (reclen)];
 	uint64_t bof_off, head, tail;
 	int err;
 
 	bof_off = spa_history_log_to_phys(shpp->sh_bof, shpp);
 
 	/* Bytes of the length field that sit before the physical end. */
 	head = MIN(sizeof (reclen), shpp->sh_phys_max_off - bof_off);
 	err = dmu_read(mos, spa->spa_history, bof_off, head, lenbuf,
 	    DMU_READ_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	/* Any remainder wrapped to the start of the circular region. */
 	tail = sizeof (reclen) - head;
 	if (tail != 0) {
 		err = dmu_read(mos, spa->spa_history,
 		    shpp->sh_pool_create_len, tail, lenbuf + head,
 		    DMU_READ_PREFETCH);
 		if (err != 0)
 			return (err);
 	}
 
 	reclen = LE_64(*((uint64_t *)lenbuf));
 	shpp->sh_bof += reclen + sizeof (reclen);
 	shpp->sh_records_lost++;
 
 	return (0);
 }
 
 /*
  * Append 'len' bytes from 'buf' at the logical EOF of the history log,
  * wrapping to the start of the circular region when the physical end
  * is reached.  Existing records are dropped from the BOF until more
  * than 'len' bytes are free.  The caller must hold spa_history_lock.
  * Returns 0, or the error from spa_history_advance_bof().
  */
 static int
 spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
     dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t eof_off, head, tail;
 	int err;
 
 	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
 
 	/* see if we need to reset logical BOF */
 	for (;;) {
 		uint64_t ring_len =
 		    shpp->sh_phys_max_off - shpp->sh_pool_create_len;
 		uint64_t in_use = shpp->sh_eof - shpp->sh_bof;
 
 		if (ring_len - in_use > len)
 			break;
 
 		err = spa_history_advance_bof(spa, shpp);
 		if (err != 0)
 			return (err);
 	}
 
 	eof_off = spa_history_log_to_phys(shpp->sh_eof, shpp);
 	head = MIN(len, shpp->sh_phys_max_off - eof_off);
 	tail = len - head;
 
 	shpp->sh_eof += len;
 	dmu_write(mos, spa->spa_history, eof_off, head, buf, tx);
 
 	if (tail > 0) {
 		/* write out the rest at the beginning of physical file */
 		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
 		    tail, (char *)buf + head, tx);
 	}
 
 	return (0);
 }
 
 /*
  * Return the name to log as the caller's zone: in the kernel, the
  * prison hostname when the current thread is jailed; otherwise NULL.
  */
 static char *
 spa_history_zone(void)
 {
 	char *zone = NULL;
 
 #ifdef _KERNEL
 	/* XXX: pr_hostname can be changed by default from within a jail! */
 	if (jailed(curthread->td_ucred))
 		zone = curthread->td_ucred->cr_prison->pr_hostname;
 #endif
 	return (zone);
 }
 
 /*
  * Write out a history event.
  *
  *	arg	the nvlist_t * record to log; this function adds the time
  *		(and, in the kernel, the host name) to it and frees it
  *		before returning
  *	tx	transaction the record is written in; also identifies
  *		the pool
  *
  * The record is packed and written as a little-endian 64-bit length
  * followed by the packed nvlist.  A failure from spa_history_write() is
  * not reported to the caller.
  */
 /*ARGSUSED*/
 static void
 spa_history_log_sync(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t	*nvl = arg;
 	spa_t		*spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t	*mos = spa->spa_meta_objset;
 	dmu_buf_t	*dbp;
 	spa_history_phys_t *shpp;
 	size_t		reclen;
 	uint64_t	le_len;
 	char		*record_packed = NULL;
 	int		ret;
 
 	/*
 	 * If we have an older pool that doesn't have a command
 	 * history object, create it now.
 	 */
 	mutex_enter(&spa->spa_history_lock);
 	if (!spa->spa_history)
 		spa_history_create_obj(spa, tx);
 	mutex_exit(&spa->spa_history_lock);
 
 	/*
 	 * Get the offset of where we need to write via the bonus buffer.
 	 * Update the offset when the write completes.
 	 */
 	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
 	shpp = dbp->db_data;
 
 	dmu_buf_will_dirty(dbp, tx);
 
 #ifdef ZFS_DEBUG
 	{
 		/* Sanity check: the bonus buffer holds the history offsets. */
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbp, &doi);
 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
 	}
 #endif
 
 	/* Stamp the record with the time and, in the kernel, the host. */
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
 #ifdef _KERNEL
 	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
 #endif
 	/* Emit a debug message describing the record, by record kind. */
 	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
 		zfs_dbgmsg("command: %s",
 		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
 	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
 		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
 			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
 		} else {
 			zfs_dbgmsg("txg %lld %s %s",
 			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
 		}
 	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
 		zfs_dbgmsg("ioctl %s",
 		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
 	}
 
 	record_packed = fnvlist_pack(nvl, &reclen);
 
 	mutex_enter(&spa->spa_history_lock);
 
 	/* write out the packed length as little endian */
 	le_len = LE_64((uint64_t)reclen);
 	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
 	/* Only write the payload if the length field went out. */
 	if (!ret)
 		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
 
 	/* The first command is the create, which we keep forever */
 	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
 	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
 		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
 	}
 
 	mutex_exit(&spa->spa_history_lock);
 	/* Release the packed buffer, the bonus hold and the record. */
 	fnvlist_pack_free(record_packed, reclen);
 	dmu_buf_rele(dbp, FTAG);
 	fnvlist_free(nvl);
 }
 
 /*
  * Write out a history event.
  *
  * Wraps 'msg' in a temporary nvlist under ZPOOL_HIST_CMD and hands it to
  * spa_history_log_nvl(), whose result is returned.
  */
 int
 spa_history_log(spa_t *spa, const char *msg)
 {
 	nvlist_t *record;
 	int result;
 
 	record = fnvlist_alloc();
 	fnvlist_add_string(record, ZPOOL_HIST_CMD, msg);
 
 	result = spa_history_log_nvl(spa, record);
 
 	fnvlist_free(record);
 	return (result);
 }
 
 /*
  * Queue a history record for asynchronous logging.
  *
  * A private copy of 'nvl' is made, tagged with the caller's zone (when
  * spa_history_zone() reports one) and real uid, and handed to
  * spa_history_log_sync() through a sync task.  The caller keeps
  * ownership of 'nvl'.
  *
  * Returns EINVAL if the pool version predates SPA_VERSION_ZPOOL_HISTORY
  * or the pool is not writeable, the error from dmu_tx_assign() if the
  * transaction cannot be assigned, and 0 otherwise.  Errors that occur
  * later, in the sync task, are not reported.
  */
 int
 spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
 {
 	int err;
 	dmu_tx_t *tx;
 	nvlist_t *nvarg;
 	char *zone;
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
 		return (SET_ERROR(EINVAL));
 
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/* Work on a copy: the sync task frees what it is given. */
 	nvarg = fnvlist_dup(nvl);
 	zone = spa_history_zone();
 	if (zone != NULL)
 		fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, zone);
 	fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
 
 	/* Kick this off asynchronously; errors are ignored. */
 	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
 	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
 	dmu_tx_commit(tx);
 
 	/* spa_history_log_sync will free nvarg */
 	return (0);
 }
 
 /*
  * Read out the command history.
  *
  *	spa	pool whose history is read
  *	offp	in: logical offset to start reading at; out: offset the
  *		consumer should pass on the next call
  *	len	in: size of 'buf'; out: number of bytes actually read
  *	buf	destination buffer
  *
  * Returns ENOENT if the pool has no history object, 0 on success
  * (including when there is nothing left to read, in which case *len is
  * 0), or the error from dmu_bonus_hold()/dmu_read().
  */
 int
 spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *dbp;
 	uint64_t read_len, phys_read_off, phys_eof;
 	uint64_t leftover = 0;
 	spa_history_phys_t *shpp;
 	int err;
 
 	/*
 	 * If the command history doesn't exist (older pool),
 	 * that's ok, just return ENOENT.
 	 */
 	if (!spa->spa_history)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * The history is logged asynchronously, so when they request
 	 * the first chunk of history, make sure everything has been
 	 * synced to disk so that we get it.
 	 */
 	if (*offp == 0 && spa_writeable(spa))
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
 		return (err);
 	shpp = dbp->db_data;
 
 #ifdef ZFS_DEBUG
 	{
 		/* Sanity check: the bonus buffer holds the history offsets. */
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbp, &doi);
 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
 	}
 #endif
 
 	mutex_enter(&spa->spa_history_lock);
 	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
 
 	if (*offp < shpp->sh_pool_create_len) {
 		/* read in just the zpool create history */
 		phys_read_off = *offp;
 		read_len = MIN(*len, shpp->sh_pool_create_len -
 		    phys_read_off);
 	} else {
 		/*
 		 * Need to reset passed in offset to BOF if the passed in
 		 * offset has since been overwritten.
 		 */
 		*offp = MAX(*offp, shpp->sh_bof);
 		phys_read_off = spa_history_log_to_phys(*offp, shpp);
 
 		/*
 		 * Read up to the minimum of what the user passed down or
 		 * the EOF (physical or logical).  If we hit physical EOF,
 		 * use 'leftover' to read from the physical BOF.
 		 */
 		if (phys_read_off <= phys_eof) {
 			/* No wrap between the read position and EOF. */
 			read_len = MIN(*len, phys_eof - phys_read_off);
 		} else {
 			/* Read to the physical end first ... */
 			read_len = MIN(*len,
 			    shpp->sh_phys_max_off - phys_read_off);
 			/* ... then continue from the wrapped region. */
 			if (phys_read_off + *len > shpp->sh_phys_max_off) {
 				leftover = MIN(*len - read_len,
 				    phys_eof - shpp->sh_pool_create_len);
 			}
 		}
 	}
 
 	/* offset for consumer to use next */
 	*offp += read_len + leftover;
 
 	/* tell the consumer how much you actually read */
 	*len = read_len + leftover;
 
 	/* Nothing to read from the first region: done. */
 	if (read_len == 0) {
 		mutex_exit(&spa->spa_history_lock);
 		dmu_buf_rele(dbp, FTAG);
 		return (0);
 	}
 
 	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
 	    DMU_READ_PREFETCH);
 	/* Second, wrapped read lands right after the first in 'buf'. */
 	if (leftover && err == 0) {
 		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
 		    leftover, buf + read_len, DMU_READ_PREFETCH);
 	}
 	mutex_exit(&spa->spa_history_lock);
 
 	dmu_buf_rele(dbp, FTAG);
 	return (err);
 }
 
 /*
  * Format an internal history message and log it.
  *
  * The nvlist will be consumed by this call: it is either freed here,
  * freed by spa_history_log_sync() when called directly (tx is syncing),
  * or handed to a sync task that runs spa_history_log_sync() later.
  */
 static void
 log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
     dmu_tx_t *tx, const char *fmt, va_list adx)
 {
 	va_list fmt_args;
 	char *text;
 	int text_size;
 
 	/*
 	 * If this is part of creating a pool, not everything is
 	 * initialized yet, so don't bother logging the internal events.
 	 * Likewise if the pool is not writeable.
 	 */
 	if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
 		fnvlist_free(nvl);
 		return;
 	}
 
 	/* 'adx' is used to size the message, the copy to format it. */
 	va_copy(fmt_args, adx);
 	text_size = vsnprintf(NULL, 0, fmt, adx) + 1;
 	text = kmem_alloc(text_size, KM_SLEEP);
 	(void) vsprintf(text, fmt, fmt_args);
 	va_end(fmt_args);
 
 	fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, text);
 	strfree(text);
 
 	fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
 
 	/* spa_history_log_sync() will free nvl */
 	if (!dmu_tx_is_syncing(tx)) {
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
 		return;
 	}
 	spa_history_log_sync(nvl, tx);
 }
 
 /*
  * Log an internal (non-command) history event for the pool.  'tx' may
  * be NULL, in which case a transaction is created, assigned and
  * committed here; if that assignment fails the event is silently
  * dropped.
  */
 void
 spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	boolean_t own_tx = (tx == NULL) ? B_TRUE : B_FALSE;
 	dmu_tx_t *htx = tx;
 	va_list ap;
 
 	/* create a tx if we didn't get one */
 	if (own_tx) {
 		htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 		if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
 			dmu_tx_abort(htx);
 			return;
 		}
 	}
 
 	va_start(ap, fmt);
 	log_internal(fnvlist_alloc(), operation, spa, htx, fmt, ap);
 	va_end(ap);
 
 	/* if we didn't get a tx from the caller, commit the one we made */
 	if (own_tx)
 		dmu_tx_commit(htx);
 }
 
 /*
  * Log an internal history event about a dataset.  The record carries
  * the dataset's name and object number in addition to the formatted
  * message.  'tx' must not be NULL (asserted).
  */
 void
 spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	char dsname[MAXNAMELEN];
 	nvlist_t *record;
 	va_list ap;
 
 	record = fnvlist_alloc();
 
 	ASSERT(tx != NULL);
 
 	/* Identify the dataset by name and by object number. */
 	dsl_dataset_name(ds, dsname);
 	fnvlist_add_string(record, ZPOOL_HIST_DSNAME, dsname);
 	fnvlist_add_uint64(record, ZPOOL_HIST_DSID, ds->ds_object);
 
 	va_start(ap, fmt);
 	log_internal(record, operation, dsl_dataset_get_spa(ds), tx, fmt,
 	    ap);
 	va_end(ap);
 }
 
 /*
  * Log an internal history event about a dsl_dir.  The record carries
  * the directory's name and the object number of its head dataset in
  * addition to the formatted message.  'tx' must not be NULL (asserted).
  */
 void
 spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	char ddname[MAXNAMELEN];
 	nvlist_t *record;
 	va_list ap;
 
 	record = fnvlist_alloc();
 
 	ASSERT(tx != NULL);
 
 	/* Identify the directory by name and by its head dataset object. */
 	dsl_dir_name(dd, ddname);
 	fnvlist_add_string(record, ZPOOL_HIST_DSNAME, ddname);
 	fnvlist_add_uint64(record, ZPOOL_HIST_DSID,
 	    dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	va_start(ap, fmt);
 	log_internal(record, operation, dd->dd_pool->dp_spa, tx, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * Log an internal history event recording the pool version, the
  * software's SPA/ZPL versions and the utsname fields of the host.
  * No transaction is supplied, so spa_history_log_internal() creates one.
  */
 void
 spa_history_log_version(spa_t *spa, const char *operation)
 {
 	u_longlong_t pool_version = (u_longlong_t)spa_version(spa);
 
 	spa_history_log_internal(spa, operation, NULL,
 	    "pool version %llu; software version %llu/%d; uts %s %s %s %s",
 	    pool_version, SPA_VERSION, ZPL_VERSION,
 	    utsname.nodename, utsname.release,
 	    utsname.version, utsname.machine);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	(revision 296519)
@@ -1,2126 +1,2127 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/spa_boot.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
 #include <sys/zfeature.h>
 
 /*
  * SPA locking
  *
  * There are four basic locks for managing spa_t structures:
  *
  * spa_namespace_lock (global mutex)
  *
  *	This lock must be acquired to do any of the following:
  *
  *		- Lookup a spa_t by name
  *		- Add or remove a spa_t from the namespace
  *		- Increase spa_refcount from non-zero
  *		- Check if spa_refcount is zero
  *		- Rename a spa_t
  *		- add/remove/attach/detach devices
  *		- Held for the duration of create/destroy/import/export
  *
  *	It does not need to handle recursion.  A create or destroy may
  *	reference objects (files or zvols) in other pools, but by
  *	definition they must have an existing reference, and will never need
  *	to lookup a spa_t by name.
  *
  * spa_refcount (per-spa refcount_t protected by mutex)
  *
  *	This reference count keeps track of any active users of the spa_t.  The
  *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  *	the refcount is never really 'zero' - opening a pool implicitly keeps
  *	some references in the DMU.  Internally we check against spa_minref, but
  *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock[] (per-spa array of rwlocks)
  *
  *	This protects the spa_t from config changes, and must be held in
  *	the following circumstances:
  *
  *		- RW_READER to perform I/O to the spa
  *		- RW_WRITER to change the vdev config
  *
  * The locking order is fairly straightforward:
  *
  *		spa_namespace_lock	->	spa_refcount
  *
  *	The namespace lock must be acquired to increase the refcount from 0
  *	or to check if it is zero.
  *
  *		spa_refcount		->	spa_config_lock[]
  *
  *	There must be at least one valid reference on the spa_t to acquire
  *	the config lock.
  *
  *		spa_namespace_lock	->	spa_config_lock[]
  *
  *	The namespace lock must always be taken before the config lock.
  *
  *
  * The spa_namespace_lock can be acquired directly and is globally visible.
  *
  * The namespace is manipulated using the following functions, all of which
  * require the spa_namespace_lock to be held.
  *
  *	spa_lookup()		Lookup a spa_t by name.
  *
  *	spa_add()		Create a new spa_t in the namespace.
  *
  *	spa_remove()		Remove a spa_t from the namespace.  This also
  *				frees up any memory associated with the spa_t.
  *
  *	spa_next()		Returns the next spa_t in the system, or the
  *				first if NULL is passed.
  *
  *	spa_evict_all()		Shutdown and remove all spa_t structures in
  *				the system.
  *
  *	spa_guid_exists()	Determine whether a pool/device guid exists.
  *
  * The spa_refcount is manipulated using the following functions:
  *
  *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
  *				called with spa_namespace_lock held if the
  *				refcount is currently zero.
  *
  *	spa_close()		Remove a reference from the spa_t.  This will
  *				not free the spa_t or remove it from the
  *				namespace.  No locking is required.
  *
  *	spa_refcount_zero()	Returns true if the refcount is currently
  *				zero.  Must be called with spa_namespace_lock
  *				held.
  *
  * The spa_config_lock[] is an array of rwlocks, ordered as follows:
  * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
  * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
  *
  * To read the configuration, it suffices to hold one of these locks as reader.
  * To modify the configuration, you must hold all locks as writer.  To modify
  * vdev state without altering the vdev tree's topology (e.g. online/offline),
  * you must hold SCL_STATE and SCL_ZIO as writer.
  *
  * We use these distinct config locks to avoid recursive lock entry.
  * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
  * block allocations (SCL_ALLOC), which may require reading space maps
  * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
  *
  * The spa config locks cannot be normal rwlocks because we need the
  * ability to hand off ownership.  For example, SCL_ZIO is acquired
  * by the issuing thread and later released by an interrupt thread.
  * They do, however, obey the usual write-wanted semantics to prevent
  * writer (i.e. system administrator) starvation.
  *
  * The lock acquisition rules are as follows:
  *
  * SCL_CONFIG
  *	Protects changes to the vdev tree topology, such as vdev
  *	add/remove/attach/detach.  Protects the dirty config list
  *	(spa_config_dirty_list) and the set of spares and l2arc devices.
  *
  * SCL_STATE
  *	Protects changes to pool state and vdev state, such as vdev
  *	online/offline/fault/degrade/clear.  Protects the dirty state list
  *	(spa_state_dirty_list) and global pool state (spa_state).
  *
  * SCL_ALLOC
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_alloc() and metaslab_claim().
  *
  * SCL_ZIO
  *	Held by bp-level zios (those which have no io_vd upon entry)
  *	to prevent changes to the vdev tree.  The bp-level zio implicitly
  *	protects all of its vdev child zios, which do not hold SCL_ZIO.
  *
  * SCL_FREE
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_free().  SCL_FREE is distinct from
  *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
  *	blocks in zio_done() while another i/o that holds either
  *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
  *
  * SCL_VDEV
  *	Held as reader to prevent changes to the vdev tree during trivial
  *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
  *	other locks, and lower than all of them, to ensure that it's safe
  *	to acquire regardless of caller context.
  *
  * In addition, the following rules apply:
  *
  * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
  *	The lock ordering is SCL_CONFIG > spa_props_lock.
  *
  * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
  *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
  *	or zio_write_phys() -- the caller must ensure that the config cannot
  *	change in the interim, and that the vdev cannot be reopened.
  *	SCL_STATE as reader suffices for both.
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
  *	spa_vdev_enter()	Acquire the namespace lock and the config lock
  *				for writing.
  *
  *	spa_vdev_exit()		Release the config lock, wait for all I/O
  *				to complete, sync the updated configs to the
  *				cache, and release the namespace lock.
  *
  * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  *
  * spa_rename() is also implemented within this file since it requires
  * manipulation of the namespace.
  */
 
 /*
  * Pool namespace state.  spa_namespace_avl holds every spa_t (searched by
  * spa_lookup(), filled by spa_add(), emptied by spa_remove()) and is
  * accessed with spa_namespace_lock held.  spa_namespace_cv is broadcast
  * whenever spa_remove() takes a pool out of the tree.
  */
 static avl_tree_t spa_namespace_avl;
 kmutex_t spa_namespace_lock;
 static kcondvar_t spa_namespace_cv;
 /* Adjusted by spa_add()/spa_remove() only for pools that have an altroot. */
 static int spa_active_count;
 /* NOTE(review): consumers are outside this part of the file - confirm use. */
 int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 /*
  * Global trees of auxiliary devices, each guarded by its own mutex:
  * hot spares and level 2 ARC (cache) devices.
  */
 static kmutex_t spa_spare_lock;
 static avl_tree_t spa_spare_avl;
 static kmutex_t spa_l2cache_lock;
 static avl_tree_t spa_l2cache_avl;
 
 /* NOTE(review): both are set and read outside this part of the file. */
 kmem_cache_t *spa_buffer_pool;
 int spa_mode_global;
 
 #ifdef ZFS_DEBUG
 /* Everything except dprintf and spa is on by default in debug builds */
 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
 #else
 int zfs_flags = 0;
 #endif
 
 /*
  * zfs_recover can be set to nonzero to attempt to recover from
  * otherwise-fatal errors, typically caused by on-disk corruption.  When
  * set, calls to zfs_panic_recover() will turn into warning messages.
  * This should only be used as a last resort, as it typically results
  * in leaked space, or worse.
  */
 boolean_t zfs_recover = B_FALSE;
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
     "Try to recover from otherwise-fatal errors.");
 
 static int
 sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
 {
 	int err, val;
 
 	val = zfs_flags;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	/*
 	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
 	 * arc buffers in the system have the necessary additional
 	 * checksum data.  However, it is safe to disable at any
 	 * time.
 	 */
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		val &= ~ZFS_DEBUG_MODIFY;
 	zfs_flags = val;
 
 	return (0);
 }
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
     sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
 
 /*
  * If destroy encounters an EIO while reading metadata (e.g. indirect
  * blocks), space referenced by the missing metadata can not be freed.
  * Normally this causes the background destroy to become "stalled", as
  * it is unable to make forward progress.  While in this stalled state,
  * all remaining space to free from the error-encountering filesystem is
  * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
  * permanently leak the space from indirect blocks that can not be read,
  * and continue to free everything else that it can.
  *
  * The default, "stalling" behavior is useful if the storage partially
  * fails (i.e. some but not all i/os fail), and then later recovers.  In
  * this case, we will be able to continue pool operations while it is
  * partially failed, and when it recovers, we can continue to free the
  * space, with no leaks.  However, note that this case is actually
  * fairly rare.
  *
  * Typically pools either (a) fail completely (but perhaps temporarily,
  * e.g. a top-level vdev going offline), or (b) have localized,
  * permanent errors (e.g. disk returns the wrong data due to bit flip or
  * firmware bug).  In case (a), this setting does not matter because the
  * pool will be suspended and the sync thread will not be able to make
  * forward progress regardless.  In case (b), because the error is
  * permanent, the best we can do is leak the minimum amount of space,
  * which is what setting this flag will do.  Therefore, it is reasonable
  * for this flag to normally be set, but we chose the more conservative
  * approach of not setting it, so that there is no possibility of
  * leaking space in the "partial temporary" failure case.
  */
 boolean_t zfs_free_leak_on_eio = B_FALSE;
 
 /*
  * Expiration time in milliseconds. This value has two meanings. First it is
  * used to determine when the spa_deadman() logic should fire. By default the
  * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
  * Secondly, the value determines if an I/O is considered "hung". Any I/O that
  * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
  * in a system panic.
  */
 uint64_t zfs_deadman_synctime_ms = 1000000ULL;
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
     &zfs_deadman_synctime_ms, 0,
     "Stalled ZFS I/O expiration time in milliseconds");
 
 /*
  * Check time in milliseconds. This defines the frequency at which we check
  * for hung I/O.
  */
 uint64_t zfs_deadman_checktime_ms = 5000ULL;
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
     &zfs_deadman_checktime_ms, 0,
     "Period of checks for stalled ZFS I/O in milliseconds");
 
 /*
  * Default value of -1 for zfs_deadman_enabled is resolved in
  * zfs_deadman_init()
  */
 int zfs_deadman_enabled = -1;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
     &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
 
 /*
  * The worst case is single-sector max-parity RAID-Z blocks, in which
  * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
  * times the size; so just assume that.  Add to this the fact that
  * we can have up to 3 DVAs per bp, and one more factor of 2 because
  * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
  * the worst case is:
  *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
  */
 int spa_asize_inflation = 24;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
     &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
 
 #ifndef illumos
 #ifdef _KERNEL
 /*
  * Resolve the "not configured" default (-1) of zfs_deadman_enabled.  A
  * value that has already been set to something else is left untouched.
  */
 static void
 zfs_deadman_init(void)
 {
 	/*
 	 * By default the deadman is enabled only on i386/amd64 running
 	 * on bare metal (vm_guest == VM_GUEST_NO); it is disabled on all
 	 * other architectures and inside virtual machines.
 	 */
 	if (zfs_deadman_enabled == -1) {
 #if defined(__amd64__) || defined(__i386__)
 		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
 #else
 		zfs_deadman_enabled = 0;
 #endif
 	}
 }
 #endif	/* _KERNEL */
 #endif	/* !illumos */
 
 /*
  * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
  * the pool to be consumed.  This ensures that we don't run the pool
  * completely out of space, due to unaccounted changes (e.g. to the MOS).
  * It also limits the worst-case time to allocate space.  If we have
  * less than this amount of free space, most ZPL operations (e.g. write,
  * create) will return ENOSPC.
  *
  * Certain operations (e.g. file removal, most administrative actions) can
  * use half the slop space.  They will only return ENOSPC if less than half
  * the slop space is free.  Typically, once the pool has less than the slop
  * space free, the user will use these operations to free up space in the pool.
  * These are the operations that call dsl_pool_adjustedsize() with the netfree
  * argument set to TRUE.
  *
  * A very restricted set of operations are always permitted, regardless of
  * the amount of free space.  These are the operations that call
  * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
  * operations result in a net increase in the amount of space used,
  * it is possible to run the pool completely out of space, causing it to
  * be permanently read-only.
  *
  * See also the comments in zfs_space_check_t.
  */
 int spa_slop_shift = 5;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
     &spa_slop_shift, 0,
     "Shift value of reserved space (1/(2^spa_slop_shift)).");
 
 /*
  * ==========================================================================
  * SPA config locking
  * ==========================================================================
  */
 /*
  * Set up each of the SCL_LOCKS config locks of 'spa': no writer, no
  * pending writers, and a freshly initialized mutex, condition variable
  * and untracked hold count.
  */
 static void
 spa_config_lock_init(spa_t *spa)
 {
 	int idx = 0;
 
 	while (idx < SCL_LOCKS) {
 		spa_config_lock_t *lock = &spa->spa_config_lock[idx];
 
 		lock->scl_writer = NULL;
 		lock->scl_write_wanted = 0;
 		mutex_init(&lock->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&lock->scl_cv, NULL, CV_DEFAULT, NULL);
 		refcount_create_untracked(&lock->scl_count);
 		idx++;
 	}
 }
 
 /*
  * Tear down each of the SCL_LOCKS config locks of 'spa'.  Every lock is
  * expected to be idle: no writer recorded and no writer waiting.
  */
 static void
 spa_config_lock_destroy(spa_t *spa)
 {
 	int idx = 0;
 
 	while (idx < SCL_LOCKS) {
 		spa_config_lock_t *lock = &spa->spa_config_lock[idx];
 
 		mutex_destroy(&lock->scl_lock);
 		cv_destroy(&lock->scl_cv);
 		refcount_destroy(&lock->scl_count);
 		ASSERT(lock->scl_writer == NULL);
 		ASSERT(lock->scl_write_wanted == 0);
 		idx++;
 	}
 }
 
 /*
  * Non-blocking acquire of the config locks selected by the 'locks' bitmask,
  * as reader or writer according to 'rw', with 'tag' recorded as the holder.
  * Returns 1 if every requested lock was taken.  Returns 0 as soon as one
  * is unavailable, after releasing the lower-indexed locks that this call
  * had already acquired.
  */
 int
 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 {
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			/* A reader gives way to an active or waiting writer. */
 			if (scl->scl_writer || scl->scl_write_wanted) {
 				mutex_exit(&scl->scl_lock);
 				/* Back out only the locks below bit i. */
 				spa_config_exit(spa, locks & ((1 << i) - 1),
 				    tag);
 				return (0);
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			/* A writer needs the lock to have no holders at all. */
 			if (!refcount_is_zero(&scl->scl_count)) {
 				mutex_exit(&scl->scl_lock);
 				/* Back out only the locks below bit i. */
 				spa_config_exit(spa, locks & ((1 << i) - 1),
 				    tag);
 				return (0);
 			}
 			scl->scl_writer = curthread;
 		}
 		/* Readers and writers alike are counted as holders. */
 		(void) refcount_add(&scl->scl_count, tag);
 		mutex_exit(&scl->scl_lock);
 	}
 	return (1);
 }
 
 /*
  * Blocking acquire of the config locks selected by the 'locks' bitmask, as
  * reader or writer according to 'rw', with 'tag' recorded as the holder.
  * Locks are taken in increasing index order.  A reader waits while a
  * writer holds or wants the lock; a writer waits until the lock has no
  * holders, advertising itself through scl_write_wanted while it waits.
  */
 void
 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
 {
 	/* Bitmask of locks this thread already holds as writer on entry. */
 	int wlocks_held = 0;
 
 	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
 
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		/* Recorded for every lock, requested or not. */
 		if (scl->scl_writer == curthread)
 			wlocks_held |= (1 << i);
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			while (scl->scl_writer || scl->scl_write_wanted) {
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			while (!refcount_is_zero(&scl->scl_count)) {
 				/* Keeps new readers out while we wait. */
 				scl->scl_write_wanted++;
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 				scl->scl_write_wanted--;
 			}
 			scl->scl_writer = curthread;
 		}
 		(void) refcount_add(&scl->scl_count, tag);
 		mutex_exit(&scl->scl_lock);
 	}
 	/*
 	 * NOTE(review): numeric comparison of the two bitmasks; looks like
 	 * a lock-ordering sanity check - confirm the intent.
 	 */
 	ASSERT(wlocks_held <= locks);
 }
 
 /*
  * Release the hold that 'tag' has on each config lock selected by the
  * 'locks' bitmask.  Locks are visited from the highest index down, i.e.
  * in the reverse of the order used by spa_config_enter().
  */
 void
 spa_config_exit(spa_t *spa, int locks, void *tag)
 {
 	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
 		ASSERT(!refcount_is_zero(&scl->scl_count));
 		/* Last holder gone: clear the writer and wake all waiters. */
 		if (refcount_remove(&scl->scl_count, tag) == 0) {
 			ASSERT(scl->scl_writer == NULL ||
 			    scl->scl_writer == curthread);
 			scl->scl_writer = NULL;	/* OK in either case */
 			cv_broadcast(&scl->scl_cv);
 		}
 		mutex_exit(&scl->scl_lock);
 	}
 }
 
 /*
  * Report which of the config locks selected by the 'locks' bitmask are
  * held, as a bitmask.  For RW_READER a lock counts as held when it has any
  * holder at all; for RW_WRITER only when the calling thread is its writer.
  * The per-lock mutexes are not taken.
  */
 int
 spa_config_held(spa_t *spa, int locks, krw_t rw)
 {
 	int held_mask = 0;
 
 	for (int bit = 0; bit < SCL_LOCKS; bit++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[bit];
 		int mask = 1 << bit;
 
 		if ((locks & mask) == 0)
 			continue;
 
 		if (rw == RW_READER) {
 			if (!refcount_is_zero(&scl->scl_count))
 				held_mask |= mask;
 		} else if (rw == RW_WRITER) {
 			if (scl->scl_writer == curthread)
 				held_mask |= mask;
 		}
 	}
 
 	return (held_mask);
 }
 
 /*
  * ==========================================================================
  * SPA namespace functions
  * ==========================================================================
  */
 
 /*
  * Find the spa_t registered under 'name' in the namespace AVL tree and
  * return it, or NULL when there is no match.  'name' may be longer than a
  * pool name: everything from the first '/', '@' or '#' onwards is ignored.
  * The caller must hold spa_namespace_lock.
  */
 spa_t *
 spa_lookup(const char *name)
 {
 	static spa_t key;	/* spa_t is large; keep it off the stack */
 	avl_index_t where;
 	char *sep;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	(void) strlcpy(key.spa_name, name, sizeof (key.spa_name));
 
 	/* Cut a full dataset name down to its pool component. */
 	sep = strpbrk(key.spa_name, "/@#");
 	if (sep != NULL)
 		*sep = '\0';
 
 	return (avl_find(&spa_namespace_avl, &key, &where));
 }
 
 /*
  * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
  * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
  * looking for potentially hung I/Os.
  */
 void
 spa_deadman(void *arg)
 {
 	/* 'arg' is the pool being watched. */
 	spa_t *spa = arg;
 
 	/*
 	 * Disable the deadman timer if the pool is suspended.
 	 */
 	if (spa_suspended(spa)) {
 #ifdef illumos
 		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
 #else
 		/* Nothing.  just don't schedule any future callouts. */
 #endif
 		return;
 	}
 
 	/* Log how long the sync has been running and count this firing. */
 	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
 	    ++spa->spa_deadman_calls);
 	/* The vdev scan for hung I/Os is optional; the message above is not. */
 	if (zfs_deadman_enabled)
 		vdev_deadman(spa->spa_root_vdev);
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	/* Re-arm the callout: zfs_deadman_checktime_ms converted to ticks. */
 	callout_schedule(&spa->spa_deadman_cycid,
 	    hz * zfs_deadman_checktime_ms / MILLISEC);
 #endif
 #endif
 }
 
 /*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
  */
 spa_t *
 spa_add(const char *name, nvlist_t *config, const char *altroot)
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
 #ifdef illumos
 	cyc_handler_t hdlr;
 	cyc_time_t when;
 #endif
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Zero-filled, so every field not set below starts out as 0/NULL. */
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
 	/* Per-pool mutexes; each one is destroyed again in spa_remove(). */
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Per-pool condition variables; also destroyed in spa_remove(). */
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One free-block list for each of the TXG_SIZE slots. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_create(&spa->spa_free_bplist[t]);
 
 	/* Name the pool; it starts uninitialized with maximal txg limits. */
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 	spa->spa_freeze_txg = UINT64_MAX;
 	spa->spa_final_txg = UINT64_MAX;
 	spa->spa_load_max_txg = UINT64_MAX;
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 
 #ifdef illumos
 	hdlr.cyh_func = spa_deadman;
 	hdlr.cyh_arg = spa;
 	hdlr.cyh_level = CY_LOW_LEVEL;
 #endif
 
 	/* The tunable is in milliseconds; the pool keeps nanoseconds. */
 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 
 #ifdef illumos
 	/*
 	 * This determines how often we need to check for hung I/Os after
 	 * the cyclic has already fired. Since checking for hung I/Os is
 	 * an expensive operation we don't want to check too frequently.
 	 * Instead wait for 5 seconds before checking again.
 	 */
 	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 	when.cyt_when = CY_INFINITY;
 	mutex_enter(&cpu_lock);
 	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 	mutex_exit(&cpu_lock);
 #else	/* !illumos */
 #ifdef _KERNEL
 	/* Non-illumos kernels use a callout in place of the cyclic. */
 	callout_init(&spa->spa_deadman_cycid, 1);
 #endif
 #endif
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 
 	/* Make the new spa_t visible in the namespace tree. */
 	avl_add(&spa_namespace_avl, spa);
 
 	/*
 	 * Set the alternate root, if there is one.
 	 */
 	if (altroot) {
 		spa->spa_root = spa_strdup(altroot);
 		/* Undone in spa_remove() when spa_root is set. */
 		spa_active_count++;
 	}
 
 	/*
 	 * Every pool starts with the default cachefile
 	 */
 	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
 	    offsetof(spa_config_dirent_t, scd_link));
 
 	/* With an altroot the single entry carries no cachefile path. */
 	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 	list_insert_head(&spa->spa_config_list, dp);
 
 	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 	    KM_SLEEP) == 0);
 
 	/* Keep private copies of the config and of its read features. */
 	if (config != NULL) {
 		nvlist_t *features;
 
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) == 0) {
 			VERIFY(nvlist_dup(features, &spa->spa_label_features,
 			    0) == 0);
 		}
 
 		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 	}
 
 	/* No features were copied above: start with an empty list. */
 	if (spa->spa_label_features == NULL) {
 		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 	}
 
 	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
 
 	/*
 	 * Starts as an empty (min > max) range; presumably narrowed as
 	 * vdevs are opened - TODO confirm against the vdev code.
 	 */
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 
 	/*
 	 * As a pool is being created, treat all features as disabled by
 	 * setting SPA_FEATURE_DISABLED for all entries in the feature
 	 * refcount cache.
 	 */
 	for (int i = 0; i < SPA_FEATURES; i++) {
 		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
 	}
 
 	return (spa);
 }
 
 /*
  * Removes a spa_t from the namespace, freeing up any memory used.  Requires
  * spa_namespace_lock.  This is called only after the spa_t has been closed and
  * deactivated.
  */
 void
 spa_remove(spa_t *spa)
 {
 	spa_config_dirent_t *dp;
 
 	/* Only an uninitialized pool with no references may be removed. */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
 
 	nvlist_free(spa->spa_config_splitting);
 
 	/* Unlink from the namespace and wake spa_namespace_cv waiters. */
 	avl_remove(&spa_namespace_avl, spa);
 	cv_broadcast(&spa_namespace_cv);
 
 	/* Undo the altroot bookkeeping done in spa_add(). */
 	if (spa->spa_root) {
 		spa_strfree(spa->spa_root);
 		spa_active_count--;
 	}
 
 	/* Free every cachefile entry together with its path, if it has one. */
 	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		list_remove(&spa->spa_config_list, dp);
 		if (dp->scd_path != NULL)
 			spa_strfree(dp->scd_path);
 		kmem_free(dp, sizeof (spa_config_dirent_t));
 	}
 
 	list_destroy(&spa->spa_config_list);
 
 	nvlist_free(spa->spa_label_features);
 	nvlist_free(spa->spa_load_info);
 	/* NOTE(review): presumably drops the cached spa_config - confirm. */
 	spa_config_set(spa, NULL);
 
 #ifdef illumos
 	mutex_enter(&cpu_lock);
 	if (spa->spa_deadman_cycid != CYCLIC_NONE)
 		cyclic_remove(spa->spa_deadman_cycid);
 	mutex_exit(&cpu_lock);
 	spa->spa_deadman_cycid = CYCLIC_NONE;
 #else	/* !illumos */
 #ifdef _KERNEL
 	/* Stop the deadman callout that spa_add() initialized. */
 	callout_drain(&spa->spa_deadman_cycid);
 #endif
 #endif
 
 	refcount_destroy(&spa->spa_refcount);
 
 	spa_config_lock_destroy(spa);
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_destroy(&spa->spa_free_bplist[t]);
 
 	zio_checksum_templates_free(spa);
 
 	/* Destroy the condition variables and mutexes set up by spa_add(). */
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 
 	mutex_destroy(&spa->spa_async_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
 	mutex_destroy(&spa->spa_evicting_os_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
 
 	/* Finally release the spa_t itself; 'spa' is invalid after this. */
 	kmem_free(spa, sizeof (spa_t));
 }
 
 /*
  * Iterate over the pool namespace: return the pool that follows 'prev',
  * or the first pool when 'prev' is NULL.  Returns NULL when there is
  * nothing further.  The caller must hold spa_namespace_lock.
  */
 spa_t *
 spa_next(spa_t *prev)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	if (!prev)
 		return (avl_first(&spa_namespace_avl));
 
 	return (AVL_NEXT(&spa_namespace_avl, prev));
 }
 
 /*
  * ==========================================================================
  * SPA refcount functions
  * ==========================================================================
  */
 
 /*
  * Add a reference to the given spa_t.  Must have at least one reference, or
  * have the namespace lock held.
  */
 void
 spa_open_ref(spa_t *spa, void *tag)
 {
 	/*
 	 * Either the pool already has at least spa_minref references, or
 	 * the caller holds the namespace lock.
 	 */
 	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
 	    MUTEX_HELD(&spa_namespace_lock));
 	/* 'tag' identifies the holder; spa_close() must use the same tag. */
 	(void) refcount_add(&spa->spa_refcount, tag);
 }
 
 /*
  * Remove a reference to the given spa_t.  Must have at least one reference, or
  * have the namespace lock held.
  */
 void
 spa_close(spa_t *spa, void *tag)
 {
 	/*
 	 * Without the namespace lock the count must be strictly above
 	 * spa_minref, so this release cannot take it below that minimum.
 	 */
 	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
 	    MUTEX_HELD(&spa_namespace_lock));
 	/* Drops the reference that was taken with the same 'tag'. */
 	(void) refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Remove a reference to the given spa_t held by a dsl dir that is
  * being asynchronously released.  Async releases occur from a taskq
  * performing eviction of dsl datasets and dirs.  The namespace lock
  * isn't held and the hold by the object being evicted may contribute to
  * spa_minref (e.g. dataset or directory released during pool export),
  * so the asserts in spa_close() do not apply.
  */
 void
 spa_async_close(spa_t *spa, void *tag)
 {
 	/* Same release as spa_close(), deliberately without its assertion. */
 	(void) refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Tell whether the pool has no users.  "Zero" here means that the
  * reference count equals spa_minref, the number of references acquired
  * when opening a pool, rather than a literal 0.  The caller must hold
  * spa_namespace_lock.
  */
 boolean_t
 spa_refcount_zero(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	if (refcount_count(&spa->spa_refcount) == spa->spa_minref)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * SPA spare and l2cache tracking
  * ==========================================================================
  */
 
 /*
  * Hot spares and cache devices are tracked using the same code below,
  * for 'auxiliary' devices.
  */
 
 /* One tracked auxiliary device: a node in a tree passed to spa_aux_*(). */
 typedef struct spa_aux {
 	/* vdev guid of the device; the key spa_aux_compare() orders by */
 	uint64_t	aux_guid;
 	/* guid of the pool that activated the device, or 0 if none has */
 	uint64_t	aux_pool;
 	/* presumably the AVL linkage; the tree setup is not in view here */
 	avl_node_t	aux_avl;
 	/* number of spa_aux_add() calls not yet undone by spa_aux_remove() */
 	int		aux_count;
 } spa_aux_t;
 
 /*
  * Ordering function for spa_aux_t nodes: compares the aux_guid of 'a' and
  * 'b' and returns -1, 0 or 1 for less than, equal or greater than.
  */
 static int
 spa_aux_compare(const void *a, const void *b)
 {
 	uint64_t lhs = ((const spa_aux_t *)a)->aux_guid;
 	uint64_t rhs = ((const spa_aux_t *)b)->aux_guid;
 
 	return ((lhs > rhs) - (lhs < rhs));
 }
 
 /*
  * Take a reference on the entry for vd's guid in 'avl'.  If the device is
  * not tracked yet, a new entry with a count of one is inserted.
  */
 void
 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key, *entry;
 	avl_index_t where;
 
 	/* Only aux_guid is consulted by the tree's comparison. */
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 	if (entry != NULL) {
 		entry->aux_count++;
 		return;
 	}
 
 	entry = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
 	entry->aux_guid = vd->vdev_guid;
 	entry->aux_count = 1;
 	avl_insert(avl, entry, where);
 }
 
 /*
  * Drop one reference on the entry for vd's guid in 'avl'; the entry must
  * exist.  The last reference frees the entry.  Otherwise, if vd's pool is
  * the one recorded as having activated the device, that record is cleared.
  */
 void
 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key, *entry;
 	avl_index_t where;
 
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 
 	ASSERT(entry != NULL);
 
 	entry->aux_count--;
 	if (entry->aux_count == 0) {
 		avl_remove(avl, entry);
 		kmem_free(entry, sizeof (spa_aux_t));
 		return;
 	}
 
 	if (entry->aux_pool == spa_guid(vd->vdev_spa))
 		entry->aux_pool = 0ULL;
 }
 
 /*
  * Look up 'guid' in 'avl' and report whether it is tracked.  When 'pool'
  * and/or 'refcnt' are non-NULL they receive the entry's activating pool
  * guid and reference count, or zero if there is no such entry.
  */
 boolean_t
 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
 {
 	spa_aux_t key, *entry;
 
 	key.aux_guid = guid;
 	entry = avl_find(avl, &key, NULL);
 
 	if (pool != NULL)
 		*pool = (entry != NULL) ? entry->aux_pool : 0ULL;
 
 	if (refcnt != NULL)
 		*refcnt = (entry != NULL) ? entry->aux_count : 0;
 
 	return (entry != NULL);
 }
 
 /*
  * Record vd's pool as the one that has activated the device tracked under
  * vd's guid in 'avl'.  The entry must exist and must not already be
  * claimed by a pool.
  */
 void
 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key, *entry;
 	avl_index_t where;
 
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 
 	ASSERT(entry != NULL);
 	ASSERT(entry->aux_pool == 0ULL);
 
 	entry->aux_pool = spa_guid(vd->vdev_spa);
 }
 
 /*
  * Spares are tracked globally due to the following constraints:
  *
  * 	- A spare may be part of multiple pools.
  * 	- A spare may be added to a pool even if it's actively in use within
  *	  another pool.
  * 	- A spare in use in any pool can only be the source of a replacement if
  *	  the target is a spare in the same pool.
  *
  * We keep track of all spares on the system through the use of a reference
  * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
  * spare, then we bump the reference count in the AVL tree.  In addition, we set
  * the 'vdev_isspare' member to indicate that the device is a spare (active or
  * inactive).  When a spare is made active (used to replace a device in the
  * pool), we also keep track of which pool it's been made a part of.
  *
  * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
  * called under the spa_namespace lock as part of vdev reconfiguration.  The
  * separate spare lock exists for the status query path, which does not need to
  * be completely consistent with respect to other vdev configuration changes.
  */
 
 /*
  * Comparator for the spare AVL tree; defers to spa_aux_compare().
  */
 static int
 spa_spare_compare(const void *a, const void *b)
 {
 	int order = spa_aux_compare(a, b);
 
 	return (order);
 }
 
 /*
  * Register vd as a spare: while holding spa_spare_lock, add a reference for
  * it to spa_spare_avl and set vdev_isspare.  The vdev must not already be
  * marked as a spare.
  */
 void
 spa_spare_add(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(!vd->vdev_isspare);
 	spa_aux_add(vd, &spa_spare_avl);
 	vd->vdev_isspare = B_TRUE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Unregister vd as a spare: while holding spa_spare_lock, drop its reference
  * in spa_spare_avl and clear vdev_isspare.  The vdev must currently be marked
  * as a spare.
  */
 void
 spa_spare_remove(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_remove(vd, &spa_spare_avl);
 	vd->vdev_isspare = B_FALSE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Report whether guid is tracked in spa_spare_avl, optionally returning the
  * recorded pool guid and reference count through 'pool' and 'refcnt'.  The
  * lookup is done under spa_spare_lock.
  */
 boolean_t
 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
 {
 	boolean_t present;
 
 	mutex_enter(&spa_spare_lock);
 	present = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
 	mutex_exit(&spa_spare_lock);
 
 	return (present);
 }
 
 /*
  * Mark the spare vd as active in its pool: while holding spa_spare_lock,
  * record the pool guid on vd's entry in spa_spare_avl.  The vdev must already
  * be marked as a spare.
  */
 void
 spa_spare_activate(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_activate(vd, &spa_spare_avl);
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Level 2 ARC devices are tracked globally for the same reasons as spares.
  * Cache devices currently only support one pool per cache device, and so
  * for these devices the aux reference count is currently unused beyond 1.
  */
 
 /*
  * Comparator for the l2cache AVL tree; defers to spa_aux_compare().
  */
 static int
 spa_l2cache_compare(const void *a, const void *b)
 {
 	int order = spa_aux_compare(a, b);
 
 	return (order);
 }
 
 /*
  * Register vd as an l2cache device: while holding spa_l2cache_lock, add a
  * reference for it to spa_l2cache_avl and set vdev_isl2cache.  The vdev must
  * not already be marked as an l2cache device.
  */
 void
 spa_l2cache_add(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(!vd->vdev_isl2cache);
 	spa_aux_add(vd, &spa_l2cache_avl);
 	vd->vdev_isl2cache = B_TRUE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Unregister vd as an l2cache device: while holding spa_l2cache_lock, drop
  * its reference in spa_l2cache_avl and clear vdev_isl2cache.  The vdev must
  * currently be marked as an l2cache device.
  */
 void
 spa_l2cache_remove(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_remove(vd, &spa_l2cache_avl);
 	vd->vdev_isl2cache = B_FALSE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Report whether guid is tracked in spa_l2cache_avl, optionally returning
  * the recorded pool guid through 'pool'.  The lookup is done under
  * spa_l2cache_lock; the reference count is not reported.
  */
 boolean_t
 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
 {
 	boolean_t present;
 
 	mutex_enter(&spa_l2cache_lock);
 	present = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
 	mutex_exit(&spa_l2cache_lock);
 
 	return (present);
 }
 
 /*
  * Mark the l2cache device vd as active in its pool: while holding
  * spa_l2cache_lock, record the pool guid on vd's entry in spa_l2cache_avl.
  * The vdev must already be marked as an l2cache device.
  */
 void
 spa_l2cache_activate(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_activate(vd, &spa_l2cache_avl);
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * ==========================================================================
  * SPA vdev locking
  * ==========================================================================
  */
 
 /*
  * Lock the given spa_t for the purpose of adding or removing a vdev.
  * Grabs the global spa_namespace_lock plus the spa config lock for writing.
  * It returns the next transaction group for the spa_t.
  */
 uint64_t
 spa_vdev_enter(spa_t *spa)
 {
 	uint64_t txg;
 
 	/* Lock order: spa_vdev_top_lock first, then spa_namespace_lock. */
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	txg = spa_vdev_config_enter(spa);
 
 	return (txg);
 }
 
 /*
  * Internal implementation for spa_vdev_enter().  Used when a vdev
  * operation requires multiple syncs (i.e. removing a device) while
  * keeping the spa_namespace_lock held.
  */
 uint64_t
 spa_vdev_config_enter(spa_t *spa)
 {
 	uint64_t next_txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Take every config lock as writer, tagged with the spa itself. */
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	next_txg = spa_last_synced_txg(spa) + 1;
 
 	return (next_txg);
 }
 
 /*
  * Used in combination with spa_vdev_config_enter() to allow the syncing
  * of multiple transactions without releasing the spa_namespace_lock.
  *
  * Drops the SCL_ALL config lock taken by spa_vdev_config_enter(), waits for
  * 'txg' to sync when 'error' is 0, frees 'vd' if one is passed in, and syncs
  * the config cache if the config dirty list was non-empty on a successful
  * operation.  'tag' is passed to zio_handle_panic_injection() when zio
  * injection is enabled.  The caller must hold spa_namespace_lock.
  */
 void
 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	int config_changed = B_FALSE;
 
 	ASSERT(txg > spa_last_synced_txg(spa));
 
 	spa->spa_pending_vdev = NULL;
 
 	/*
 	 * Reassess the DTLs.
 	 */
 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
 
 	/*
 	 * Only a successful operation that left dirty config entries counts
 	 * as a config change; it also bumps the config generation.
 	 */
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/*
 	 * Verify the metaslab classes.
 	 */
 	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
 
 	spa_config_exit(spa, SCL_ALL, spa);
 
 	/*
 	 * Panic the system if the specified tag requires it.  This
 	 * is useful for ensuring that configurations are updated
 	 * transactionally.
 	 */
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, tag, 0);
 
 	/*
 	 * Note: this txg_wait_synced() is important because it ensures
 	 * that there won't be more than one config change per txg.
 	 * This allows us to use the txg as the generation number.
 	 */
 	if (error == 0)
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	/*
 	 * Free the vdev handed to us, retaking the full config lock as
 	 * writer around vdev_free().
 	 */
 	if (vd != NULL) {
 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);
 	}
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed)
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 }
 
 /*
  * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
  * locking of spa_vdev_enter(), we also want to make sure the transactions
  * have synced to disk, and then update the global configuration cache with
  * the new information.
  *
  * Returns 'error' unchanged so callers can return the result directly.
  */
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
 	/* Release in the reverse order of spa_vdev_enter(). */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * Lock the given spa_t for the purpose of changing vdev state.
  *
  * Takes SCL_STATE_ALL plus any additional config locks in 'oplocks' as
  * writer and remembers the resulting mask in spa_vdev_locks.
  */
 void
 spa_vdev_state_enter(spa_t *spa, int oplocks)
 {
 	int locks = SCL_STATE_ALL | oplocks;
 
 	/*
 	 * Root pools may need to read from the underlying devfs filesystem
 	 * when opening up a vdev.  Unfortunately if we're holding the
 	 * SCL_ZIO lock it will result in a deadlock when we try to issue
 	 * the read from the root filesystem.  Instead we "prefetch"
 	 * the associated vnodes that we need prior to opening the
 	 * underlying devices and cache them so that we can prevent
 	 * any I/O when we are doing the actual open.
 	 */
 	if (spa_is_root(spa)) {
 		/*
 		 * Split the mask at SCL_ZIO: 'low' keeps the SCL_ZIO bit and
 		 * every bit above it, 'high' keeps the bits below SCL_ZIO.
 		 * The 'high' set is taken first so that vdev_hold() runs
 		 * before the 'low' set (which includes SCL_ZIO) is acquired.
 		 */
 		int low = locks & ~(SCL_ZIO - 1);
 		int high = locks & ~low;
 
 		spa_config_enter(spa, high, spa, RW_WRITER);
 		vdev_hold(spa->spa_root_vdev);
 		spa_config_enter(spa, low, spa, RW_WRITER);
 	} else {
 		spa_config_enter(spa, locks, spa, RW_WRITER);
 	}
 	/* Remembered for spa_vdev_state_exit(), which drops this mask. */
 	spa->spa_vdev_locks = locks;
 }
 
 /*
  * Counterpart of spa_vdev_state_enter(): drops the config locks recorded in
  * spa_vdev_locks.  When 'vd' is non-NULL its top-level vdev is marked state
  * dirty, the config generation is bumped, the change is waited on and the
  * config cache is synced.  Returns 'error' unchanged.
  */
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
 	boolean_t config_changed = B_FALSE;
 
 	/* Reassess DTLs for vd's top-level vdev, or the whole tree. */
 	if (vd != NULL || error == 0)
 		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
 		    0, 0, B_FALSE);
 
 	if (vd != NULL) {
 		vdev_state_dirty(vd->vdev_top);
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/* Undo the vdev_hold() done by spa_vdev_state_enter(). */
 	if (spa_is_root(spa))
 		vdev_rele(spa->spa_root_vdev);
 
 	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
 	spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
 	/*
 	 * If anything changed, wait for it to sync.  This ensures that,
 	 * from the system administrator's perspective, zpool(1M) commands
 	 * are synchronous.  This is important for things like zpool offline:
 	 * when the command completes, you expect no further I/O from ZFS.
 	 */
 	if (vd != NULL)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous functions
  * ==========================================================================
  */
 
 /*
  * Add 'feature' to the pool's spa_label_features nvlist if it is not there
  * yet and, outside of pool creation, dirty the root vdev's config.
  */
 void
 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
 {
 	/* Nothing to do when the feature is already recorded. */
 	if (nvlist_exists(spa->spa_label_features, feature))
 		return;
 
 	fnvlist_add_boolean(spa->spa_label_features, feature);
 
 	/*
 	 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
 	 * dirty the vdev config because lock SCL_CONFIG is not held.
 	 * Thankfully, in this case we don't need to dirty the config
 	 * because it will be written out anyway when we finish
 	 * creating the pool.
 	 */
 	if (tx->tx_txg == TXG_INITIAL)
 		return;
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Remove 'feature' from the pool's spa_label_features nvlist.  The root
  * vdev's config is dirtied only when nvlist_remove_all() returns 0.
  */
 void
 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
 {
 	int err = nvlist_remove_all(spa->spa_label_features, feature);
 
 	if (err != 0)
 		return;
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Rename a spa_t.
  *
  * Opens the pool called 'name', re-keys it in spa_namespace_avl under
  * 'newname', dirties the root vdev's config, waits for a sync and then syncs
  * the config cache.  Returns the spa_open() error if the pool cannot be
  * opened, otherwise 0.
  */
 int
 spa_rename(const char *name, const char *newname)
 {
 	spa_t *spa;
 	int err;
 
 	/*
 	 * Lookup the spa_t and grab the config lock for writing.  We need to
 	 * actually open the pool so that we can sync out the necessary labels.
 	 * It's OK to call spa_open() with the namespace lock held because we
 	 * allow recursive calls for other reasons.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if ((err = spa_open(name, &spa, FTAG)) != 0) {
 		mutex_exit(&spa_namespace_lock);
 		return (err);
 	}
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * The namespace tree is created with spa_name_compare(), so the
 	 * entry is removed before the name changes and re-added afterwards.
 	 */
 	avl_remove(&spa_namespace_avl, spa);
 	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
 	avl_add(&spa_namespace_avl, spa);
 
 	/*
 	 * Sync all labels to disk with the new names by marking the root vdev
 	 * dirty and waiting for it to sync.  It will pick up the new pool name
 	 * during the sync.
 	 */
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Sync the updated config cache.
 	 */
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 
 	spa_close(spa, FTAG);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Return the spa_t associated with given pool_guid, if it exists.  If
  * device_guid is non-zero, determine whether the pool exists *and* contains
  * a device with the specified device_guid.
  */
 spa_t *
 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
 {
 	avl_tree_t *tree = &spa_namespace_avl;
 	spa_t *spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	for (spa = avl_first(tree); spa != NULL; spa = AVL_NEXT(tree, spa)) {
 		/* Skip pools that are uninitialized or have no root vdev. */
 		if (spa->spa_state == POOL_STATE_UNINITIALIZED ||
 		    spa->spa_root_vdev == NULL)
 			continue;
 
 		if (spa_guid(spa) != pool_guid)
 			continue;
 
 		/* A pool match is enough when no device was requested. */
 		if (device_guid == 0)
 			break;
 
 		if (vdev_lookup_by_guid(spa->spa_root_vdev,
 		    device_guid) != NULL)
 			break;
 
 		/*
 		 * Check any devices we may be in the process of adding.
 		 */
 		if (spa->spa_pending_vdev != NULL &&
 		    vdev_lookup_by_guid(spa->spa_pending_vdev,
 		    device_guid) != NULL)
 			break;
 	}
 
 	/* NULL here means the loop ran off the end without a match. */
 	return (spa);
 }
 
 /*
  * Determine whether a pool with the given pool_guid exists.
  */
 boolean_t
 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
 {
 	spa_t *match = spa_by_guid(pool_guid, device_guid);
 
 	return (match != NULL);
 }
 
 /*
  * Return a kmem-allocated copy of the NUL-terminated string s.  The
  * allocation is strlen(s) + 1 bytes, which is the size spa_strfree() passes
  * back to kmem_free().
  */
 char *
 spa_strdup(const char *s)
 {
 	size_t nbytes = strlen(s) + 1;
 	char *copy = kmem_alloc(nbytes, KM_SLEEP);
 
 	/* Copy the characters, then terminate explicitly. */
 	bcopy(s, copy, nbytes - 1);
 	copy[nbytes - 1] = '\0';
 
 	return (copy);
 }
 
 /*
  * Free a string with kmem_free(), using strlen(s) + 1 as the allocation
  * size (the size spa_strdup() allocates).
  */
 void
 spa_strfree(char *s)
 {
 	size_t nbytes = strlen(s) + 1;
 
 	kmem_free(s, nbytes);
 }
 
 /*
  * Return a pseudo-random value in [0, range).  'range' must be non-zero.
  */
 uint64_t
 spa_get_random(uint64_t range)
 {
 	uint64_t raw;
 
 	ASSERT(range != 0);
 
 	(void) random_get_pseudo_bytes((void *)&raw, sizeof (raw));
 
 	return (raw % range);
 }
 
 /*
  * Generate a non-zero random guid that does not collide with an existing
  * one.  With a non-NULL spa the candidate is checked as a device guid within
  * that pool; with a NULL spa it is checked as a pool guid.
  */
 uint64_t
 spa_generate_guid(spa_t *spa)
 {
 	uint64_t guid;
 
 	for (;;) {
 		boolean_t taken;
 
 		guid = spa_get_random(-1ULL);
 		if (guid == 0)
 			continue;
 
 		if (spa != NULL)
 			taken = spa_guid_exists(spa_guid(spa), guid);
 		else
 			taken = spa_guid_exists(guid, 0);
 
 		if (!taken)
 			break;
 	}
 
 	return (guid);
 }
 
 /*
  * Format a description of the block pointer 'bp' into 'buf' (at most
  * 'buflen' bytes) using the SNPRINTF_BLKPTR macro.  The type, checksum and
  * compression names are looked up here and handed to the macro.
  */
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
 	/*
 	 * NOTE(review): 'type' is only filled in when bp != NULL; presumably
 	 * SNPRINTF_BLKPTR does not read it for a NULL bp -- confirm against
 	 * the macro definition.
 	 */
 	char type[256];
 	char *checksum = NULL;
 	char *compress = NULL;
 
 	if (bp != NULL) {
 		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
 			/* New-style type: describe it by byteswap class. */
 			dmu_object_byteswap_t bswap =
 			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			(void) snprintf(type, sizeof (type), "bswap %s %s",
 			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
 			    "metadata" : "data",
 			    dmu_ot_byteswap[bswap].ob_name);
 		} else {
 			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
 			    sizeof (type));
 		}
 		/* The checksum name is only looked up for non-embedded BPs. */
 		if (!BP_IS_EMBEDDED(bp)) {
 			checksum =
 			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
 		}
 		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 	}
 
 	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
 	    compress);
 }
 
 /*
  * Set the pool's freeze txg, if it has not been set yet (still UINT64_MAX),
  * to TXG_SIZE txgs past the last synced txg, then wait for that txg to sync.
  * Does nothing when a freeze txg was already set.
  */
 void
 spa_freeze(spa_t *spa)
 {
 	uint64_t freeze_txg = 0;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	if (spa->spa_freeze_txg == UINT64_MAX) {
 		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
 		spa->spa_freeze_txg = freeze_txg;
 	}
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	/* Wait outside the config lock, and only if we set the txg here. */
 	if (freeze_txg != 0)
 		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
 }
 
 /*
  * Report a printf-style message through vcmn_err(): at CE_WARN level when
  * zfs_recover is set, at CE_PANIC level otherwise.
  */
 void
 zfs_panic_recover(const char *fmt, ...)
 {
 	int level = zfs_recover ? CE_WARN : CE_PANIC;
 	va_list args;
 
 	va_start(args, fmt);
 	vcmn_err(level, fmt, args);
 	va_end(args);
 }
 
 /*
  * This is a stripped-down version of strtoull, suitable only for converting
  * lowercase hexadecimal numbers that don't overflow.
  */
 uint64_t
 zfs_strtonum(const char *str, char **nptr)
 {
 	const char *p = str;
 	uint64_t result = 0;
 
 	for (;; p++) {
 		char ch = *p;
 		int nibble;
 
 		if (ch >= '0' && ch <= '9')
 			nibble = ch - '0';
 		else if (ch >= 'a' && ch <= 'f')
 			nibble = ch - 'a' + 10;
 		else
 			break;	/* NUL or any non-hex character ends the scan */
 
 		/* Shift in one hex digit; overflow is not detected. */
 		result = (result << 4) + nibble;
 	}
 
 	/* Report where parsing stopped, if the caller asked for it. */
 	if (nptr != NULL)
 		*nptr = (char *)p;
 
 	return (result);
 }
 
 /*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
  */
 
 /*
  * Return the pool's spa_async_suspended value.
  */
 boolean_t
 spa_shutting_down(spa_t *spa)
 {
 	boolean_t suspended = spa->spa_async_suspended;
 
 	return (suspended);
 }
 
 /*
  * Return the pool's dsl_pool_t (spa_dsl_pool).
  */
 dsl_pool_t *
 spa_get_dsl(spa_t *spa)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	return (dp);
 }
 
 /*
  * Return the pool's spa_is_initializing flag.
  */
 boolean_t
 spa_is_initializing(spa_t *spa)
 {
 	boolean_t initializing = spa->spa_is_initializing;
 
 	return (initializing);
 }
 
 /*
  * Return a pointer to the root block pointer stored in spa_ubsync.
  */
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
 	blkptr_t *rootbp = &spa->spa_ubsync.ub_rootbp;
 
 	return (rootbp);
 }
 
 /*
  * Copy *bp into the root block pointer of spa_uberblock.  Note that this is
  * a different uberblock field than the one spa_get_rootblkptr() reads.
  */
 void
 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
 {
 	blkptr_t *rootbp = &spa->spa_uberblock.ub_rootbp;
 
 	*rootbp = *bp;
 }
 
 /*
  * Copy the pool's alternate root path (spa_root) into 'buf', which holds
  * 'buflen' bytes.  An empty string is stored when no alternate root is set.
  *
  * strlcpy() is used so the result is always NUL-terminated when buflen > 0,
  * truncating the path if it does not fit.  strncpy() would leave 'buf'
  * unterminated whenever the path is buflen bytes or longer.
  */
 void
 spa_altroot(spa_t *spa, char *buf, size_t buflen)
 {
 	if (spa->spa_root == NULL)
 		buf[0] = '\0';
 	else
 		(void) strlcpy(buf, spa->spa_root, buflen);
 }
 
 /*
  * Return the pool's spa_sync_pass value.
  */
 int
 spa_sync_pass(spa_t *spa)
 {
 	int pass = spa->spa_sync_pass;
 
 	return (pass);
 }
 
 /*
  * Return a pointer to the pool's name buffer (spa_name); no copy is made.
  */
 char *
 spa_name(spa_t *spa)
 {
 	char *name = spa->spa_name;
 
 	return (name);
 }
 
 /*
  * Return the pool guid.  Falls back to spa_config_guid when there is no root
  * vdev; otherwise prefers the last synced guid unless called from syncing
  * context, where the root vdev's current guid is returned.
  */
 uint64_t
 spa_guid(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	uint64_t synced_guid;
 
 	/*
 	 * If we fail to parse the config during spa_load(), we can go through
 	 * the error path (which posts an ereport) and end up here with no root
 	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
 	 * this case.
 	 */
 	if (spa->spa_root_vdev == NULL)
 		return (spa->spa_config_guid);
 
 	if (spa->spa_last_synced_guid != 0)
 		synced_guid = spa->spa_last_synced_guid;
 	else
 		synced_guid = spa->spa_root_vdev->vdev_guid;
 
 	/*
 	 * Return the most recently synced out guid unless we're
 	 * in syncing context.
 	 */
 	if (dp != NULL && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 
 	return (synced_guid);
 }
 
 /*
  * Return the pool's load guid.
  */
 uint64_t
 spa_load_guid(spa_t *spa)
 {
 	uint64_t load_guid;
 
 	/*
 	 * This is a GUID that exists solely as a reference for the
 	 * purposes of the arc.  It is generated at load time, and
 	 * is never written to persistent storage.
 	 */
 	load_guid = spa->spa_load_guid;
 
 	return (load_guid);
 }
 
 /*
  * Return the txg recorded in spa_ubsync (ub_txg).
  */
 uint64_t
 spa_last_synced_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_ubsync.ub_txg;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_first_txg value.
  */
 uint64_t
 spa_first_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_first_txg;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_syncing_txg value.
  */
 uint64_t
 spa_syncing_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_syncing_txg;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_state value.
  */
 pool_state_t
 spa_state(spa_t *spa)
 {
 	pool_state_t state = spa->spa_state;
 
 	return (state);
 }
 
 /*
  * Return the pool's spa_load_state value.
  */
 spa_load_state_t
 spa_load_state(spa_t *spa)
 {
 	spa_load_state_t state = spa->spa_load_state;
 
 	return (state);
 }
 
 /*
  * Return the pool's spa_freeze_txg value.
  */
 uint64_t
 spa_freeze_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_freeze_txg;
 
 	return (txg);
 }
 
 /*
  * Return lsize scaled by the global spa_asize_inflation factor.  The spa
  * argument is not used.
  */
 /* ARGSUSED */
 uint64_t
 spa_get_asize(spa_t *spa, uint64_t lsize)
 {
 	uint64_t asize = lsize * spa_asize_inflation;
 
 	return (asize);
 }
 
 /*
  * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
  * or at least 32MB.
  *
  * See the comment above spa_slop_shift for details.
  */
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
 	uint64_t dspace = spa_get_dspace(spa);
 	uint64_t slop = dspace >> spa_slop_shift;
 	uint64_t min_slop = SPA_MINDEVSIZE >> 1;
 
 	/* Never report less than the fixed minimum. */
 	return (MAX(slop, min_slop));
 }
 
 /*
  * Return the cached spa_dspace value (set by spa_update_dspace()).
  */
 uint64_t
 spa_get_dspace(spa_t *spa)
 {
 	uint64_t dspace = spa->spa_dspace;
 
 	return (dspace);
 }
 
 /*
  * Recompute spa_dspace as the normal class's dspace plus the dedup dspace
  * reported by ddt_get_dedup_dspace().
  */
 void
 spa_update_dspace(spa_t *spa)
 {
 	uint64_t normal_dspace;
 	uint64_t dedup_dspace;
 
 	normal_dspace = metaslab_class_get_dspace(spa_normal_class(spa));
 	dedup_dspace = ddt_get_dedup_dspace(spa);
 
 	spa->spa_dspace = normal_dspace + dedup_dspace;
 }
 
 /*
  * Return the failure mode that has been set to this pool. The default
  * behavior will be to block all I/Os when a complete failure occurs.
  */
 uint8_t
 spa_get_failmode(spa_t *spa)
 {
 	uint8_t failmode = spa->spa_failmode;
 
 	return (failmode);
 }
 
 /*
  * Return the pool's spa_suspended flag.
  */
 boolean_t
 spa_suspended(spa_t *spa)
 {
 	boolean_t suspended = spa->spa_suspended;
 
 	return (suspended);
 }
 
 /*
  * Return the version recorded in spa_ubsync (ub_version).
  */
 uint64_t
 spa_version(spa_t *spa)
 {
 	uint64_t version = spa->spa_ubsync.ub_version;
 
 	return (version);
 }
 
 /*
  * Return the pool's spa_deflate setting.
  */
 boolean_t
 spa_deflate(spa_t *spa)
 {
 	boolean_t deflate = spa->spa_deflate;
 
 	return (deflate);
 }
 
 /*
  * Return the pool's normal metaslab class (spa_normal_class).
  */
 metaslab_class_t *
 spa_normal_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_normal_class;
 
 	return (mc);
 }
 
 /*
  * Return the pool's log metaslab class (spa_log_class).
  */
 metaslab_class_t *
 spa_log_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_log_class;
 
 	return (mc);
 }
 
 /*
  * Add 'os' to the head of the pool's spa_evicting_os_list, under
  * spa_evicting_os_lock.
  */
 void
 spa_evicting_os_register(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_insert_head(&spa->spa_evicting_os_list, os);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Remove 'os' from the pool's spa_evicting_os_list, under
  * spa_evicting_os_lock, and wake every thread waiting on
  * spa_evicting_os_cv.
  */
 void
 spa_evicting_os_deregister(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_remove(&spa->spa_evicting_os_list, os);
 	/* Waiters in spa_evicting_os_wait() re-check the list when woken. */
 	cv_broadcast(&spa->spa_evicting_os_cv);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Block until the pool's spa_evicting_os_list is empty, then call
  * dmu_buf_user_evict_wait().
  */
 void
 spa_evicting_os_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	/* Loop so the list is re-checked after every wakeup. */
 	while (!list_is_empty(&spa->spa_evicting_os_list))
 		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
 	mutex_exit(&spa->spa_evicting_os_lock);
 
 	dmu_buf_user_evict_wait();
 }
 
 /*
  * Return the maximum replication level for this pool: 1 for pool versions
  * older than SPA_VERSION_DITTO_BLOCKS, otherwise the smaller of
  * SPA_DVAS_PER_BP and spa_max_replication_override.
  */
 int
 spa_max_replication(spa_t *spa)
 {
 	/*
 	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
 	 * handle BPs with more than one DVA allocated.  Set our max
 	 * replication level accordingly.
 	 */
 	if (spa_version(spa) >= SPA_VERSION_DITTO_BLOCKS)
 		return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 
 	return (1);
 }
 
 /*
  * Return the pool's spa_prev_software_version value.
  */
 int
 spa_prev_software_version(spa_t *spa)
 {
 	int version = spa->spa_prev_software_version;
 
 	return (version);
 }
 
 /*
  * Return the pool's spa_deadman_synctime value.
  */
 uint64_t
 spa_deadman_synctime(spa_t *spa)
 {
 	uint64_t synctime = spa->spa_deadman_synctime;
 
 	return (synctime);
 }
 
 /*
  * Return the deflated size of a single DVA.  When the DVA's asize is zero or
  * the pool does not deflate, the asize is returned unchanged; otherwise it is
  * scaled by the deflate ratio of the DVA's top-level vdev.  Panics if that
  * vdev cannot be found.  A config lock must be held by the caller.
  */
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
 	uint64_t asize = DVA_GET_ASIZE(dva);
 	uint64_t vdev_id;
 	vdev_t *tvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (asize == 0 || !spa->spa_deflate)
 		return (asize);
 
 	vdev_id = DVA_GET_VDEV(dva);
 	tvd = vdev_lookup_top(spa, vdev_id);
 	if (tvd == NULL) {
 		panic(
 		    "dva_get_dsize_sync(): bad DVA %llu:%llu",
 		    (u_longlong_t)vdev_id, (u_longlong_t)asize);
 	}
 
 	return ((asize >> SPA_MINBLOCKSHIFT) * tvd->vdev_deflate_ratio);
 }
 
 /*
  * Sum dva_get_dsize_sync() over the BP_GET_NDVAS(bp) DVAs of a block
  * pointer.  No config lock is taken here.
  */
 uint64_t
 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total = 0;
 	int d = 0;
 
 	while (d < BP_GET_NDVAS(bp)) {
 		total += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 		d++;
 	}
 
 	return (total);
 }
 
 /*
  * Sum dva_get_dsize_sync() over the BP_GET_NDVAS(bp) DVAs of a block
  * pointer while holding SCL_VDEV as reader.
  */
 uint64_t
 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total = 0;
 	int d = 0;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	while (d < BP_GET_NDVAS(bp)) {
 		total += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 		d++;
 	}
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * ==========================================================================
  * Initialization and Termination
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_t entries: orders pools by spa_name using strcmp()
  * and normalizes the result to -1, 0 or 1.
  */
 static int
 spa_name_compare(const void *a1, const void *a2)
 {
 	const spa_t *lhs = a1;
 	const spa_t *rhs = a2;
 	int cmp = strcmp(lhs->spa_name, rhs->spa_name);
 
 	return ((cmp > 0) - (cmp < 0));
 }
 
 /*
  * Return the global spa_active_count.
  */
 int
 spa_busy(void)
 {
 	int active = spa_active_count;
 
 	return (active);
 }
 
 /*
  * Load the pool configuration via spa_config_load().  In kernel builds this
  * is registered below as a 'mountroot' event handler.
  */
 void
 spa_boot_init()
 {
 	spa_config_load();
 }
 
 #ifdef _KERNEL
 EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
 #endif
 
 /*
  * Global SPA initialization: set up the namespace, spare and l2cache locks
  * and AVL trees, record 'mode' in spa_mode_global, initialize the dependent
  * subsystems, load the pool configuration and start the l2arc.
  * spa_fini() is the counterpart.
  */
 void
 spa_init(int mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Pools are keyed by name; spares and l2cache devices by guid. */
 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
 	    offsetof(spa_t, spa_avl));
 
 	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	spa_mode_global = mode;
 
 #ifdef illumos
 #ifdef _KERNEL
 	spa_arch_init();
 #else
 	/*
 	 * Userland only: enable ARC watchpoints when not opened read-only
 	 * and the "watch" debug string is set.
 	 */
 	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
 		arc_procfd = open("/proc/self/ctl", O_WRONLY);
 		if (arc_procfd == -1) {
 			perror("could not enable watchpoints: "
 			    "opening /proc/self/ctl failed: ");
 		} else {
 			arc_watch = B_TRUE;
 		}
 	}
 #endif
 #endif /* illumos */
 	/* spa_fini() tears the subsystems below down in reverse order. */
 	refcount_sysinit();
 	unique_init();
 	range_tree_init();
 	zio_init();
 	lz4_init();
 	dmu_init();
 	zil_init();
 	vdev_cache_stat_init();
 	zfs_prop_init();
 	zpool_prop_init();
 	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
 #ifndef illumos
 #ifdef _KERNEL
 	zfs_deadman_init();
 #endif
 #endif	/* !illumos */
 }
 
 /*
  * Global SPA teardown, the counterpart of spa_init(): stop the l2arc, evict
  * all pools, shut down the subsystems and destroy the global AVL trees,
  * condition variable and locks.
  */
 void
 spa_fini(void)
 {
 	l2arc_stop();
 
 	spa_evict_all();
 
 	/* Reverse of the initialization order used in spa_init(). */
 	vdev_cache_stat_fini();
 	zil_fini();
 	dmu_fini();
 	lz4_fini();
 	zio_fini();
 	range_tree_fini();
 	unique_fini();
 	refcount_fini();
 
 	avl_destroy(&spa_namespace_avl);
 	avl_destroy(&spa_spare_avl);
 	avl_destroy(&spa_l2cache_avl);
 
 	cv_destroy(&spa_namespace_cv);
 	mutex_destroy(&spa_namespace_lock);
 	mutex_destroy(&spa_spare_lock);
 	mutex_destroy(&spa_l2cache_lock);
 }
 
 /*
  * Return whether this pool has slogs. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
  * performance and not correctness
  */
 boolean_t
 spa_has_slogs(spa_t *spa)
 {
 	metaslab_class_t *log_class = spa->spa_log_class;
 
 	/* A non-NULL rotor on the log class is taken to mean slogs exist. */
 	return (log_class->mc_rotor != NULL);
 }
 
 /*
  * Return the pool's spa_log_state value.
  */
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
 	spa_log_state_t state = spa->spa_log_state;
 
 	return (state);
 }
 
 /*
  * Set the pool's spa_log_state to 'state'.  No locking is done here.
  */
 void
 spa_set_log_state(spa_t *spa, spa_log_state_t state)
 {
 	spa->spa_log_state = state;
 }
 
 /*
  * Return the pool's spa_is_root flag.
  */
 boolean_t
 spa_is_root(spa_t *spa)
 {
 	boolean_t is_root = spa->spa_is_root;
 
 	return (is_root);
 }
 
 /*
  * Return whether the FWRITE bit is set in the pool's spa_mode.
  */
 boolean_t
 spa_writeable(spa_t *spa)
 {
 	return ((spa->spa_mode & FWRITE) != 0);
 }
 
 /*
  * Returns true if there is a pending sync task in any of the current
  * syncing txg, the current quiescing txg, or the current open txg.
  */
 boolean_t
 spa_has_pending_synctask(spa_t *spa)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	/* Pending work exists unless every sync-task txg list is empty. */
 	if (txg_all_lists_empty(&dp->dp_sync_tasks))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Return the pool's spa_mode value.
  */
 int
 spa_mode(spa_t *spa)
 {
 	int mode = spa->spa_mode;
 
 	return (mode);
 }
 
 /*
  * Return the pool's spa_bootfs value.
  */
 uint64_t
 spa_bootfs(spa_t *spa)
 {
 	uint64_t bootfs = spa->spa_bootfs;
 
 	return (bootfs);
 }
 
 /*
  * Return the pool's spa_delegation value.
  */
 uint64_t
 spa_delegation(spa_t *spa)
 {
 	uint64_t delegation = spa->spa_delegation;
 
 	return (delegation);
 }
 
 /*
  * Return the pool's meta objset (spa_meta_objset).
  */
 objset_t *
 spa_meta_objset(spa_t *spa)
 {
 	objset_t *mos = spa->spa_meta_objset;
 
 	return (mos);
 }
 
 /*
  * Return the pool's spa_dedup_checksum value.
  */
 enum zio_checksum
 spa_dedup_checksum(spa_t *spa)
 {
 	enum zio_checksum cksum = spa->spa_dedup_checksum;
 
 	return (cksum);
 }
 
 /*
  * Reset pool scan stat per scan pass (or reboot).
  */
 void
 spa_scan_stat_init(spa_t *spa)
 {
 	/*
 	 * Per-pass counters: data not stored on disk.  The pass start time
 	 * is taken from gethrestime_sec().
 	 */
 	spa->spa_scan_pass_start = gethrestime_sec();
 	spa->spa_scan_pass_exam = 0;
 
 	/* Reset the per-vdev scan stats starting at the root vdev. */
 	vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
 /*
  * Get scan stats for zpool status reports
  */
 int
 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
 	dsl_scan_t *scn = NULL;
 
 	if (spa->spa_dsl_pool != NULL)
 		scn = spa->spa_dsl_pool->dp_scan;
 
 	/* No scan state, or no scan function recorded: nothing to report. */
 	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
 		return (SET_ERROR(ENOENT));
 
 	bzero(ps, sizeof (*ps));
 
 	/* data stored on disk */
 	ps->pss_func = scn->scn_phys.scn_func;
 	ps->pss_state = scn->scn_phys.scn_state;
 	ps->pss_start_time = scn->scn_phys.scn_start_time;
 	ps->pss_end_time = scn->scn_phys.scn_end_time;
 	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
 	ps->pss_examined = scn->scn_phys.scn_examined;
 	ps->pss_to_process = scn->scn_phys.scn_to_process;
 	ps->pss_processed = scn->scn_phys.scn_processed;
 	ps->pss_errors = scn->scn_phys.scn_errors;
 
 	/* data not stored on disk */
 	ps->pss_pass_start = spa->spa_scan_pass_start;
 	ps->pss_pass_exam = spa->spa_scan_pass_exam;
 
 	return (0);
 }
 
 /*
  * Return the pool's spa_debug flag.
  */
 boolean_t
 spa_debug_enabled(spa_t *spa)
 {
 	boolean_t debug = spa->spa_debug;
 
 	return (debug);
 }
 
 /*
  * Return the maximum block size for this pool: SPA_MAXBLOCKSIZE when the
  * large-blocks feature is enabled, SPA_OLD_MAXBLOCKSIZE otherwise.
  */
 int
 spa_maxblocksize(spa_t *spa)
 {
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
 		return (SPA_OLD_MAXBLOCKSIZE);
 
 	return (SPA_MAXBLOCKSIZE);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 296519)
@@ -1,952 +1,953 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2013 DEY Storage Systems, Inc.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_priority.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct uio;
 struct xuio;
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
 struct sa_handle;
 struct file;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 
 typedef enum dmu_object_byteswap {	/* extracted by DMU_OT_BYTESWAP() */
 	DMU_BSWAP_UINT8,
 	DMU_BSWAP_UINT16,
 	DMU_BSWAP_UINT32,
 	DMU_BSWAP_UINT64,
 	DMU_BSWAP_ZAP,
 	DMU_BSWAP_DNODE,
 	DMU_BSWAP_OBJSET,
 	DMU_BSWAP_ZNODE,
 	DMU_BSWAP_OLDACL,
 	DMU_BSWAP_ACL,
 	/*
 	 * Allocating a new byteswap type number makes the on-disk format
 	 * incompatible with any other format that uses the same number.
 	 *
 	 * Data can usually be structured to work with one of the
 	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
 	 */
 	DMU_BSWAP_NUMFUNCS	/* count; sizes dmu_ot_byteswap[] */
 } dmu_object_byteswap_t;
 
 #define	DMU_OT_NEWTYPE 0x80		/* bit 7: type was built by DMU_OT() */
 #define	DMU_OT_METADATA 0x40		/* bit 6: object holds metadata */
 #define	DMU_OT_BYTESWAP_MASK 0x3f	/* bits 0-5: dmu_object_byteswap_t */
 
 /*
  * Build a uint8_t object type from its two properties: the byteswap
  * routine to use (dmu_object_byteswap_t) and whether the object's data
  * is metadata (boolean).  Usable in constant expressions.
  */
 #define	DMU_OT(byteswap, metadata) \
 	(((byteswap) & DMU_OT_BYTESWAP_MASK) | \
 	((metadata) ? DMU_OT_METADATA : 0) | \
 	DMU_OT_NEWTYPE)
 
 #define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)	/* range check; ot is evaluated twice */
 
 #define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(ot)].ot_metadata)	/* new types yield 0 or 0x40 */
 
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
  * is repurposed for embedded BPs.  Note that ot is evaluated twice.
  */
 #define	DMU_OT_HAS_FILL(ot) \
 	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
 
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	dmu_ot[(ot)].ot_byteswap)	/* old types: looked up in dmu_ot[] */
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPOBJ,			/* UINT64 */
 	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_OLDACL,			/* Old ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 	DMU_OT_DSL_PERMS,		/* ZAP */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_SA,			/* System attr */
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_DEADLIST,		/* ZAP */
 	DMU_OT_DEADLIST_HDR,		/* UINT64 */
 	DMU_OT_DSL_CLONES,		/* ZAP */
 	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	/*
 	 * Do not allocate new object types here. Doing so makes the on-disk
 	 * format incompatible with any other format that uses the same object
 	 * type number.
 	 *
 	 * When creating an object which does not have one of the above types
 	 * use the DMU_OTN_* type with the correct byteswap and metadata
 	 * values.
 	 *
 	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
 	 * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
 	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
 	 * and DMU_OTN_* types).
 	 */
 	DMU_OT_NUMTYPES,	/* count of the types above; sizes dmu_ot[] */
 
 	/*
 	 * Names for valid types declared with DMU_OT().
 	 */
 	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
 	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
 	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
 	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
 	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
 	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
 	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
 	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
 	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
 typedef enum txg_how {	/* the txg_how argument of dmu_tx_assign() */
 	TXG_WAIT = 1,	/* numbering starts at 1, not 0 */
 	TXG_NOWAIT,
 	TXG_WAITED,
 } txg_how_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 #define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks
  */
 #define	DMU_BONUS_BLKID		(-1ULL)
 #define	DMU_SPILL_BLKID		(-2ULL)
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
 int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
     struct nvlist *snaps);
 int dmu_objset_clone(const char *name, const char *origin);
 int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_snapshot_tmp(const char *, const char *, int);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 typedef struct dmu_buf {	/* handle returned by dmu_buf_hold() et al. */
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
 #define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 #define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on an objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allocated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx);
 
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_read() to
  * read db_data, dmu_buf_will_dirty() before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
  *
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
 dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
  * Special spill buffer support used by "SA" framework
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
  * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
 
 /*
  * Add a reference to a dmu buffer that has already been held via
  * dmu_buf_hold() in the current context.
  */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 
 /*
  * Attempt to add a reference to a dmu buffer that is in an unknown state,
  * using a pointer that may have been invalidated by eviction processing.
  * The request will succeed if the passed in dbuf still represents the
  * same os/object/blkid, is ineligible for eviction, and has at least
  * one hold by a user other than the syncer.
  */
 boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
     uint64_t blkid, void *tag);
 
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, boolean_t read, void *tag,
     int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 typedef void dmu_buf_evict_func_t(void *user_ptr);
 
 /*
  * A DMU buffer user object may be associated with a dbuf for the
  * duration of its lifetime.  This allows the user of a dbuf (client)
  * to attach private data to a dbuf (e.g. in-core only data such as a
  * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
  * when that dbuf has been evicted.  Clients typically respond to the
  * eviction notification by freeing their private data, thus ensuring
  * the same lifetime for both dbuf and private data.
  *
  * The mapping from a dmu_buf_user_t to any client private data is the
  * client's responsibility.  All current consumers of the API with private
  * data embed a dmu_buf_user_t as the first member of the structure for
  * their private data.  This allows conversions between the two types
  * with a simple cast.  Since the DMU buf user API never needs access
  * to the private data, other strategies can be employed if necessary
  * or convenient for the client (e.g. using container_of() to do the
  * conversion for private data that cannot have the dmu_buf_user_t as
  * its first member).
  *
  * Eviction callbacks are executed without the dbuf mutex held or any
  * other type of mechanism to guarantee that the dbuf is still available.
  * For this reason, users must assume the dbuf has already been freed
  * and not reference the dbuf from the callback context.
  *
  * Users requesting "immediate eviction" are notified as soon as the dbuf
  * is only referenced by dirty records (dirties == holds).  Otherwise the
  * notification occurs after eviction processing for the dbuf begins.
  */
 typedef struct dmu_buf_user {
 	/*
 	 * Asynchronous user eviction callback state.
 	 */
 	taskq_ent_t	dbu_tqent;
 
 	/* Eviction callback; installed by dmu_buf_init_user(). */
 	dmu_buf_evict_func_t *dbu_evict_func;
 #ifdef ZFS_DEBUG
 	/*
 	 * Pointer to user's dbuf pointer.  NULL for clients that do
 	 * not associate a dbuf with their user data.
 	 *
 	 * The dbuf pointer is cleared upon eviction so as to catch
 	 * use-after-evict bugs in clients.
 	 */
 	dmu_buf_t **dbu_clear_on_evict_dbufp;
 #endif	/* ZFS_DEBUG */
 } dmu_buf_user_t;
 
 /*
  * Initialize the given dmu_buf_user_t instance with the eviction function
  * evict_func, to be called when the user is evicted.  clear_on_evict_dbufp
  * is recorded only when ZFS_DEBUG is defined.
  * NOTE: This function should only be called once on a given dmu_buf_user_t.
  *       To allow enforcement of this, dbu must already be zeroed on entry.
  */
 #ifdef __lint
 /* Very ugly, but it beats issuing suppression directives in many Makefiles. */
 extern void
 dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
     dmu_buf_t **clear_on_evict_dbufp);
 #else /* __lint */
 inline void
 dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
     dmu_buf_t **clear_on_evict_dbufp)
 {
 	ASSERT(dbu->dbu_evict_func == NULL);	/* zeroed, not yet set up */
 	ASSERT(evict_func != NULL);	/* a callback is mandatory */
 	dbu->dbu_evict_func = evict_func;
 #ifdef ZFS_DEBUG
 	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
 #endif	/* ZFS_DEBUG */
 }
 #endif /* __lint */
 
 /*
  * Attach user data to a dbuf and mark it for normal (when the dbuf's
  * data is cleared or its reference count goes to zero) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Attach user data to a dbuf and mark it for immediate (its dirty and
  * reference counts are equal) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Replace the current user of a dbuf.
  *
  * If given the current user of a dbuf, replaces the dbuf's user with
  * "new_user" and returns the user data pointer that was replaced.
  * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
 void *dmu_buf_replace_user(dmu_buf_t *db,
     dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
 
 /*
  * Remove the specified user data for a DMU buffer.
  *
  * Returns the user that was removed on success, or the current user if
  * another user currently owns the buffer.
  */
 void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
 
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
 struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * Tells if the given dbuf is freeable.
  */
 boolean_t dmu_buf_freeable(dmu_buf_t *);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
  *
  * dcb_data is a pointer to caller private data that is passed on as a
  * callback parameter. The caller is responsible for properly allocating and
  * freeing it.
  *
  * When registering a callback, the transaction must be already created, but
  * it cannot be committed or aborted. It can be assigned to a txg or not.
  *
  * The callback will be called after the transaction has been safely written
  * to stable storage and will also be called if the dmu_tx is aborted.
  * If there is any error which prevents the transaction from being committed to
  * disk, the callback will be called with a value of error != 0.
  */
 typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
     void *dcb_data);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size);
 int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 #define	DMU_READ_PREFETCH	0 /* prefetch */
 #define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 #ifdef _KERNEL
 #ifdef illumos
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 #else
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
 #endif
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
     dmu_tx_t *tx);
 int dmu_xuio_init(struct xuio *uio, int niov);
 void dmu_xuio_fini(struct xuio *uio);
 int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
     size_t n);
 int dmu_xuio_cnt(struct xuio *uio);
 struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
 void dmu_xuio_clear(struct xuio *uio, int i);
 void xuio_stat_wbuf_copied(void);	/* prototype: takes no arguments */
 void xuio_stat_wbuf_nocopy(void);	/* prototype: takes no arguments */
 
 extern boolean_t zfs_prefetch_disable;
 extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* Filled in by dmu_object_info(); sizes are in bytes unless noted. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_nblkptr;
 	uint8_t doi_pad[4];
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void arc_byteswap_func_t(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {	/* element type of dmu_ot[] */
 	dmu_object_byteswap_t	ot_byteswap;	/* see DMU_OT_BYTESWAP() */
 	boolean_t		ot_metadata;	/* see DMU_OT_IS_METADATA() */
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 typedef struct dmu_object_byteswap_info {	/* dmu_ot_byteswap[] entry */
 	arc_byteswap_func_t	*ob_func;
 	char			*ob_name;
 } dmu_object_byteswap_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
  * Like dmu_object_info_from_db, but faster still when you only care about
  * the size.  This is specifically optimized for zfs_getattr().
  */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 typedef struct dmu_objset_stats {	/* see dmu_objset_fast_stat() */
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	char dds_origin[MAXNAMELEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 /*
  * Get the [cm]time for an objset's snapshot dir
  */
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
 extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 typedef int objset_used_cb_t(dmu_object_type_t bonustype,
     void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 
 /*
  * {zfs,zvol,ztest}_get_done() args; taken by dmu_sync() and dmu_sync_cb_t
  */
 typedef struct zgd {
 	struct zilog	*zgd_zilog;
 	struct blkptr	*zgd_bp;
 	dmu_buf_t	*zgd_db;
 	struct rl	*zgd_rl;
 	void		*zgd_private;
 } zgd_t;
 
 typedef void dmu_sync_cb_t(zgd_t *arg, int error);
 int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Check if a DMU object has any dirty blocks. If so, sync out
  * all pending transaction groups. Otherwise, this function
  * does not alter DMU state. This could be improved to only sync
  * out the necessary transaction groups for this particular
  * object.
  */
 int dmu_object_wait_synced(objset_t *os, uint64_t object);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
     struct file *fp, offset_t *offp);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 extern int zfs_mdcomp_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	(revision 296519)
@@ -1,187 +1,188 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_OBJSET_H
 #define	_SYS_DMU_OBJSET_H
 
 #include <sys/spa.h>
 #include <sys/arc.h>
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sa.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 extern krwlock_t os_lock;
 
 struct dsl_pool;
 struct dsl_dataset;
 struct dmu_tx;
 
 /*
  * Byte sizes of the current and of the older, smaller objset_phys_t
  * layout.  objset_phys_t below is padded out to OBJSET_PHYS_SIZE.
  */
 #define	OBJSET_PHYS_SIZE 2048
 #define	OBJSET_OLD_PHYS_SIZE 1024
 
 /*
  * True when the ARC buffer is larger than the old layout.  The
  * userused/groupused dnodes sit at the tail of objset_phys_t, so
  * presumably only such a buffer has room for them.
  */
 #define	OBJSET_BUF_HAS_USERUSED(buf) \
 	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
 
 /* Flag bit 0; presumably kept in os_flags. */
 #define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)
 
 /*
  * Physical objset layout.  os_pad is sized as OBJSET_PHYS_SIZE minus the
  * three dnode_phys_t members, the ZIL header and the two uint64_t fields,
  * so the members add up to OBJSET_PHYS_SIZE bytes (assuming the compiler
  * inserts no padding of its own).
  */
 typedef struct objset_phys {
 	dnode_phys_t os_meta_dnode;
 	zil_header_t os_zil_header;
 	uint64_t os_type;
 	uint64_t os_flags;
 	char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
 	    sizeof (zil_header_t) - sizeof (uint64_t)*2];
 	/* These two dnodes follow the padding, at the tail of the layout. */
 	dnode_phys_t os_userused_dnode;
 	dnode_phys_t os_groupused_dnode;
 } objset_phys_t;
 
 /*
  * Runtime objset state; the objset_phys_t is referenced through os_phys.
  * Members are grouped below by the locking rule that applies to them.
  * Note that the os_lock member is a kmutex_t, while this header also
  * declares a file-scope krwlock_t with the same name.
  */
 struct objset {
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
 	spa_t *os_spa;
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
 	/*
 	 * The following "special" dnodes have no parent, are exempt
 	 * from dnode_move(), and are not recorded in os_dnodes, but they
 	 * root their descendants in this objset using handles anyway, so
 	 * that all access to dnodes from dbufs consistently uses handles.
 	 */
 	dnode_handle_t os_meta_dnode;
 	dnode_handle_t os_userused_dnode;
 	dnode_handle_t os_groupused_dnode;
 	zilog_t *os_zil;
 
 	list_node_t os_evicting_node;
 
 	/* can change, under dsl_dir's locks: */
 	enum zio_checksum os_checksum;
 	enum zio_compress os_compress;
 	uint8_t os_copies;
 	enum zio_checksum os_dedup_checksum;
 	boolean_t os_dedup_verify;
 	zfs_logbias_op_t os_logbias;
 	zfs_cache_type_t os_primary_cache;
 	zfs_cache_type_t os_secondary_cache;
 	zfs_sync_type_t os_sync;
 	zfs_redundant_metadata_type_t os_redundant_metadata;
 	int os_recordsize;
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
 	blkptr_t *os_rootbp;
 	zil_header_t os_zil_header;
 	list_t os_synced_dnodes;
 	uint64_t os_flags;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;
 	uint64_t os_obj_next;
 
 	/* Protected by os_lock */
 	kmutex_t os_lock;
 	/* One dirty list and one free list per TXG_SIZE slot. */
 	list_t os_dirty_dnodes[TXG_SIZE];
 	list_t os_free_dnodes[TXG_SIZE];
 	list_t os_dnodes;
 	list_t os_downgraded_dbufs;
 
 	/* stuff we store for the user */
 	kmutex_t os_user_ptr_lock;
 	void *os_user_ptr;
 	sa_os_t *os_sa;
 };
 
 #define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 /*
  * The object number is compared as a signed 64-bit value, so this is true
  * for object 0 and for any object number whose top bit is set.
  */
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
 /* Fetch the dnode behind each of the objset's three special handles. */
 #define	DMU_META_DNODE(os)	((os)->os_meta_dnode.dnh_dnode)
 #define	DMU_USERUSED_DNODE(os)	((os)->os_userused_dnode.dnh_dnode)
 #define	DMU_GROUPUSED_DNODE(os)	((os)->os_groupused_dnode.dnh_dnode)
 
 /* True when os_secondary_cache is ZFS_CACHE_ALL or ZFS_CACHE_METADATA. */
 #define	DMU_OS_IS_L2CACHEABLE(os)				\
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
 /* Ignores 'os'; the result depends only on the global zfs_mdcomp_disable. */
 #define	DMU_OS_IS_L2COMPRESSIBLE(os)	(zfs_mdcomp_disable == B_FALSE)
 
 /*
  * called from zpl
  *
  * The hold/own variants return the objset through *osp.  The 'tag' given
  * here is presumably the one expected by the matching dmu_objset_rele() /
  * dmu_objset_disown() call.
  */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
 int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
     dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 /*
  * 'func' takes a pool, a dataset and a void pointer (presumably the 'arg'
  * supplied here) and returns an int.
  */
 int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
     int func(struct dsl_pool *, struct dsl_dataset *, void *),
     void *arg, int flags);
 int dmu_objset_prefetch(const char *name, void *arg);
 void dmu_objset_evict_dbufs(objset_t *os);
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
 void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
 boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
 objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_t **osp);
 void dmu_objset_evict(objset_t *os);
 void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
 void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 int dmu_fsname(const char *snapname, char *buf);
 
 void dmu_objset_evict_done(objset_t *os);
 
 /* Module setup and teardown (paired by name). */
 void dmu_objset_init(void);
 void dmu_objset_fini(void);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DMU_OBJSET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h	(revision 296519)
@@ -1,90 +1,91 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _DMU_SEND_H
 #define	_DMU_SEND_H
 
 #include <sys/spa.h>
 
 struct vnode;
 struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
 struct dmu_replay_record;
 
 extern const char *recv_clone_name;
 
 /*
  * Send entry points.  In dmu_send() and dmu_send_obj() the output handle
  * parameter is a struct vnode * when 'illumos' is defined and a
  * struct file * otherwise; the rest of each signature is shared by both
  * arms of the #ifdef.
  */
 int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
     boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
     struct vnode *vp, offset_t *off);
 #else
     struct file *fp, offset_t *off);
 #endif
 /* The two estimate variants report their result through *sizep. */
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
 int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
     uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
     boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, struct vnode *vp, offset_t *off);
 #else
     int outfd, struct file *fp, offset_t *off);
 #endif
 
 /*
  * Receive state shared by dmu_recv_begin(), dmu_recv_stream() and
  * dmu_recv_end(), each of which takes a pointer to it.  Several members
  * mirror the names of dmu_recv_begin() parameters (tofs, tosnap, force,
  * resumable, drr_begin) and drc_owner mirrors dmu_recv_end()'s 'owner';
  * presumably they cache those arguments, but confirm against the
  * implementation.
  */
 typedef struct dmu_recv_cookie {
 	struct dsl_dataset *drc_ds;
 	struct dmu_replay_record *drc_drr_begin;
 	struct drr_begin *drc_drrb;
 	const char *drc_tofs;
 	const char *drc_tosnap;
 	boolean_t drc_newfs;
 	boolean_t drc_byteswap;
 	boolean_t drc_force;
 	boolean_t drc_resumable;
 	struct avl_tree *drc_guid_to_ds_map;
 	zio_cksum_t drc_cksum;
 	uint64_t drc_newsnapobj;
 	void *drc_owner;
 	cred_t *drc_cred;
 } dmu_recv_cookie_t;
 
 /*
  * Receive entry points; all three operate on a caller-provided
  * dmu_recv_cookie_t.
  */
 int dmu_recv_begin(char *tofs, char *tosnap,
     struct dmu_replay_record *drr_begin,
     boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
 /*
  * Only the first line of the dmu_recv_stream() prototype differs between
  * platforms (struct vnode * on illumos, struct file * otherwise); the
  * trailing parameters after the #endif are common to both.
  */
 #ifdef illumos
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
 #else
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
 #endif
     int cleanup_fd, uint64_t *action_handlep);
 int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
 boolean_t dmu_objset_is_receiving(objset_t *os);
 
 #endif /* _DMU_SEND_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 296519)
@@ -1,363 +1,364 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_DSL_DATASET_H
 #define	_SYS_DSL_DATASET_H
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/refcount.h>
 #include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;
 
 /*
  * Bit 0 of ds_flags.  DS_IS_INCONSISTENT() tests it on the
  * dsl_dataset_phys_t obtained through dsl_dataset_phys(); the result is
  * the masked flag value (non-zero when set), not a normalized boolean.
  */
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 #define	DS_IS_INCONSISTENT(ds)	\
 	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
 
 /*
  * Do not allow this dataset to be promoted.
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
 /*
  * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
  * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
  * refquota/refreservations).
  */
 #define	DS_FLAG_UNIQUE_ACCURATE	(1ULL<<2)
 
 /*
  * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
  * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
  */
 #define	DS_FLAG_DEFER_DESTROY	(1ULL<<3)
 #define	DS_IS_DEFER_DESTROY(ds)	\
 	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
 
 /*
  * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
  * They should be of the format <reverse-dns>:<field>.
  */
 
 /*
  * This field's value is the object ID of a zap object which contains the
  * bookmarks of this dataset.  If it is present, then this dataset is counted
  * in the refcount of the SPA_FEATURE_BOOKMARKS feature.
  */
 #define	DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
 
 /*
  * This field is present (with value=0) if this dataset may contain large
  * blocks (>128KB).  If it is present, then this dataset
  * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
  */
 #define	DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
 
 /*
  * These fields are set on datasets that are in the middle of a resumable
  * receive, and allow the sender to resume the send if it is interrupted.
  */
 #define	DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
 #define	DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
 #define	DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
 #define	DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
 #define	DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
 #define	DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
 #define	DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
 
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
  */
 #define	DS_FLAG_CI_DATASET	(1ULL<<16)
 
 #define	DS_CREATE_FLAG_NODIRTY	(1ULL<<24)
 
 /*
  * Physical dataset record.  At run time it is reached through
  * dsl_dataset_phys(), which returns the db_data of the dataset's dbuf.
  * Sixteen uint64_t fields precede ds_bp; ds_pad rounds the structure up
  * to the 320 bytes mentioned on that member.
  */
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;		/* DMU_OT_DSL_DIR */
 	uint64_t ds_prev_snap_obj;	/* DMU_OT_DSL_DATASET */
 	uint64_t ds_prev_snap_txg;
 	uint64_t ds_next_snap_obj;	/* DMU_OT_DSL_DATASET */
 	uint64_t ds_snapnames_zapobj;	/* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
 	uint64_t ds_deadlist_obj;	/* DMU_OT_DEADLIST */
 	/*
 	 * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
 	 * include all blocks referenced by this dataset, including those
 	 * shared with any other datasets.
 	 */
 	uint64_t ds_referenced_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
 	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
 	/*
 	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
 	 * collisions.  The ds_guid is a 64-bit ID that will never
 	 * change, so there is a small probability that it will collide.
 	 */
 	uint64_t ds_fsid_guid;
 	uint64_t ds_guid;
 	uint64_t ds_flags;		/* DS_FLAG_* */
 	blkptr_t ds_bp;
 	uint64_t ds_next_clones_obj;	/* DMU_OT_DSL_CLONES */
 	uint64_t ds_props_obj;		/* DMU_OT_DSL_PROPS for snaps */
 	uint64_t ds_userrefs_obj;	/* DMU_OT_USERREFS */
 	uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 /*
  * Runtime dataset state.  Members are grouped by the locking rule that
  * applies to them.  There is no ds_phys member: the physical record is
  * obtained from ds_dbuf through dsl_dataset_phys().
  */
 typedef struct dsl_dataset {
 	dmu_buf_user_t ds_dbu;
 
 	/* Immutable: */
 	struct dsl_dir *ds_dir;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 	uint64_t ds_fsid_guid;
 	boolean_t ds_is_snapshot;
 
 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
 	uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
 
 	/* has internal locking: */
 	dsl_deadlist_t ds_deadlist;
 	bplist_t ds_pending_deadlist;
 
 	/* protected by lock on pool's dp_dirty_datasets list */
 	txg_node_t ds_dirty_link;
 	list_node_t ds_synced_link;
 
 	/*
 	 * dsl_dataset_phys(ds)->ds_<accounting> is also protected by ds_lock.
 	 * Protected by ds_lock:
 	 */
 	kmutex_t ds_lock;
 	objset_t *ds_objset;
 	uint64_t ds_userrefs;
 	void *ds_owner;
 
 	/*
 	 * Long holds prevent the ds from being destroyed; they allow the
 	 * ds to remain held even after dropping the dp_config_rwlock.
 	 * Owning counts as a long hold.  See the comments above
 	 * dsl_pool_hold() for details.
 	 */
 	refcount_t ds_longholds;
 
 	/* no locking; only for making guesses */
 	uint64_t ds_trysnap_txg;
 
 	/* for objset_open() */
 	kmutex_t ds_opening_lock;
 
 	uint64_t ds_reserved;	/* cached refreservation */
 	uint64_t ds_quota;	/* cached refquota */
 
 	kmutex_t ds_sendstream_lock;
 	list_t ds_sendstreams;
 
 	/*
 	 * When in the middle of a resumable receive, tracks how much
 	 * progress we have made.  One slot per TXG_SIZE entry.
 	 */
 	uint64_t ds_resume_object[TXG_SIZE];
 	uint64_t ds_resume_offset[TXG_SIZE];
 	uint64_t ds_resume_bytes[TXG_SIZE];
 
 	/* Protected by our dsl_dir's dd_lock */
 	list_t ds_prop_cbs;
 
 	/*
 	 * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
 	 * uses this feature.  One byte per feature, SPA_FEATURES in all.
 	 */
 	uint8_t ds_feature_inuse[SPA_FEATURES];
 
 	/*
 	 * Set if we need to activate the feature on this dataset this txg
 	 * (used only in syncing context).
 	 */
 	uint8_t ds_feature_activation_needed[SPA_FEATURES];
 
 	/* Protected by ds_lock; keep at end of struct for better locality */
 	char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
 /*
  * Return the physical dataset record for ds, i.e. the data pointer of the
  * dbuf referenced by ds->ds_dbuf.  Neither ds nor ds->ds_dbuf is checked
  * for NULL.
  */
 inline dsl_dataset_phys_t *
 dsl_dataset_phys(dsl_dataset_t *ds)
 {
 	dmu_buf_t *phys_dbuf = ds->ds_dbuf;

 	return (phys_dbuf->db_data);
 }
 
 /*
  * The max length of a temporary tag prefix is the number of hex digits
  * required to express UINT64_MAX (16) plus one for the hyphen.
  */
 #define	MAX_TAG_PREFIX_LEN	17
 
 /*
  * Derived from the physical record: ds_num_children is documented there
  * as ==0 for a head dataset.  dsl_dataset_t also carries a separate
  * ds_is_snapshot member.
  */
 #define	dsl_dataset_is_snapshot(ds) \
 	(dsl_dataset_phys(ds)->ds_num_children != 0)
 
 /* Normalized (0 or 1) test of DS_FLAG_UNIQUE_ACCURATE in ds_flags. */
 #define	DS_UNIQUE_IS_ACCURATE(ds)	\
 	((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
 /*
  * Hold and own interfaces.  Per the ds_longholds comment in
  * dsl_dataset_t, owning a dataset counts as a long hold.  The 'tag'
  * supplied on hold/own is presumably the one to pass to the matching
  * dsl_dataset_rele() / dsl_dataset_disown().
  */
 int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
     dsl_dataset_t **dsp);
 boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
     void *tag);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
 int dsl_dataset_own(struct dsl_pool *dp, const char *name,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
 /* The dprintf_ds() macro in this header passes a MAXNAMELEN-byte buffer. */
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx);
 int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
 int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
     minor_t cleanup_minor, const char *htag);
 
 /* Accessors for the dataset's block pointer and its pool. */
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
 boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
     dsl_dataset_t *snap);
 
 /*
  * NOTE(review): the dsl_dataset_t parameter is named 'os' here and in
  * dsl_dataset_stats() below, unlike the 'ds' used everywhere else.
  */
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
 int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx, boolean_t async);
 boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
 void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
 void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
 void dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
 /* Both space queries report through *usedp, *compp and *uncompp. */
 int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used,
     uint64_t *ref_rsrv);
 int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t quota);
 int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
 boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
     uint64_t earlier_txg);
 /* Long holds; see the ds_longholds comment in dsl_dataset_t. */
 void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
 
 /* "_impl" variants; the check/sync name pairs take a dmu_tx_t. */
 int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
 void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx);
 int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
 void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx);
 
 void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx);
 void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
 int dsl_dataset_get_snapname(dsl_dataset_t *ds);
 int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
     uint64_t *value);
 int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
     boolean_t adj_cnt);
 void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx);
 void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
 boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
 boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
 int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
 
 void dsl_dataset_deactivate_feature(uint64_t dsobj,
     spa_feature_t f, dmu_tx_t *tx);
 
 /*
  * dprintf_ds(ds, fmt, ...): debug printf that prefixes the message with
  * "ds=<dataset name> ".
  *
  * With ZFS_DEBUG defined, and only when zfs_flags has ZFS_DEBUG_DPRINTF
  * set, a MAXNAMELEN-byte scratch buffer is allocated with KM_SLEEP, filled
  * in by dsl_dataset_name(), passed to dprintf() and then freed.  Because
  * __VA_ARGS__ is used without '##', at least one argument must follow fmt.
  *
  * Without ZFS_DEBUG the macro expands to nothing, so none of its arguments
  * are evaluated.  (The first parameter of that stub is spelled 'dd'; this
  * is harmless because it is unused.)
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
 	kmem_free(__ds_name, MAXNAMELEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_ds(dd, fmt, ...)
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DSL_DATASET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	(revision 296519)
@@ -1,922 +1,923 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/nvpair.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Forward references that lots of things need.
  */
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct metaslab_group metaslab_group_t;
 typedef struct metaslab_class metaslab_class_t;
 typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;
 struct dsl_dataset;
 
 /*
  * General-purpose 32-bit and 64-bit bitfield encodings.
  */
 /*
  * BFnn_DECODE yields the len-bit field of x that starts at bit 'low';
  * BFnn_ENCODE masks x to len bits and moves it up to bit 'low'.
  */
 #define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
 #define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
 #define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
 #define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
 
 #define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
 #define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
 
 /*
  * BFnn_SET replaces the field in place: XOR-ing x with the encoded
  * difference between the current field and val leaves all other bits
  * untouched.  Every macro argument is parenthesised where it is expanded
  * so that compound expressions (e.g. a conditional for 'low') keep their
  * meaning.  x must be an lvalue and is expanded more than once.
  */
 #define	BF32_SET(x, low, len, val) do { \
 	ASSERT3U(val, <, 1U << (len)); \
 	ASSERT3U((low) + (len), <=, 32); \
 	(x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BF64_SET(x, low, len, val) do { \
 	ASSERT3U(val, <, 1ULL << (len)); \
 	ASSERT3U((low) + (len), <=, 64); \
 	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len)); \
 _NOTE(CONSTCOND) } while (0)
 
 /*
  * "Shifted and biased" variants: the stored field is (val >> shift) - bias,
  * and the getters undo that as (field + bias) << shift.
  */
 #define	BF32_GET_SB(x, low, len, shift, bias)	\
 	((BF32_GET(x, low, len) + (bias)) << (shift))
 #define	BF64_GET_SB(x, low, len, shift, bias)	\
 	((BF64_GET(x, low, len) + (bias)) << (shift))
 
 #define	BF32_SET_SB(x, low, len, shift, bias, val) do { \
 	ASSERT(IS_P2ALIGNED(val, 1U << (shift))); \
 	ASSERT3S((val) >> (shift), >=, bias); \
 	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
 _NOTE(CONSTCOND) } while (0)
 #define	BF64_SET_SB(x, low, len, shift, bias, val) do { \
 	ASSERT(IS_P2ALIGNED(val, 1ULL << (shift))); \
 	ASSERT3S((val) >> (shift), >=, bias); \
 	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
 _NOTE(CONSTCOND) } while (0)
 
 /*
  * We currently support block sizes from 512 bytes to 16MB.
  * The benefits of larger blocks, and thus larger IO, need to be weighed
  * against the cost of COWing a giant block to modify one byte, and the
  * large latency of reading or writing a large block.
  *
  * Note that although blocks up to 16MB are supported, the recordsize
  * property can not be set larger than zfs_max_recordsize (default 1MB).
  * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
  *
  * Note that although the LSIZE field of the blkptr_t can store sizes up
  * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
  * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define	SPA_MINBLOCKSHIFT	9
 #define	SPA_OLD_MAXBLOCKSHIFT	17
 #define	SPA_MAXBLOCKSHIFT	24
 #define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
 #define	SPA_OLD_MAXBLOCKSIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
 
 /*
  * Default maximum supported logical ashift.
  *
  * The current 8k allocation block size limit is due to the 8k
  * aligned/sized operations performed by vdev_probe() on
  * vdev_label->vl_pad2.  Using another "safe region" for these tests
  * would allow the limit to be raised to 16k, at the expense of
  * only having 8 available uberblocks in the label area.
  */
 #define	SPA_MAXASHIFT		13
 
 /*
  * Default minimum supported logical ashift.
  */
 #define SPA_MINASHIFT		SPA_MINBLOCKSHIFT
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
  * overhead, three DVAs per bp, plus one more bit in case we do anything
  * else that expands the ASIZE.
  */
 #define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
  */
 typedef struct dva {
 	uint64_t	dva_word[2];
 } dva_t;
 
 /*
  * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
  */
 typedef struct zio_cksum {
 	uint64_t	zc_word[4];
 } zio_cksum_t;
 
 /*
  * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
  * secret and is suitable for use in MAC algorithms as the key.
  */
 typedef struct zio_cksum_salt {
 	uint8_t		zcs_bytes[32];
 } zio_cksum_salt_t;
 
 /*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|		vdev1		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|		vdev2		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|		vdev3		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			checksum[2]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			checksum[3]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * vdev		virtual device ID
  * offset	offset into virtual device
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
  * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
  * X		encryption (on version 30, which is not supported)
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg of block allocation; zero if same as logical birth txg
  * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
 
 /*
  * "Embedded" blkptr_t's don't actually point to a block, instead they
  * have a data payload embedded in the blkptr_t itself.  See the comment
  * in blkptr.c for more details.
  *
  * The blkptr_t is laid out as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|      payload                                                  |
  * 1	|      payload                                                  |
  * 2	|      payload                                                  |
  * 3	|      payload                                                  |
  * 4	|      payload                                                  |
  * 5	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|      payload                                                  |
  * 8	|      payload                                                  |
  * 9	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|      payload                                                  |
  * c	|      payload                                                  |
  * d	|      payload                                                  |
  * e	|      payload                                                  |
  * f	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * payload		contains the embedded data
  * B (byteorder)	byteorder (endianness)
  * D (dedup)		padding (set to zero)
  * X			encryption (set to zero; see above)
  * E (embedded)		set to one
  * lvl			indirection level
  * type			DMU object type
  * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp			compression function of payload
  * PSIZE		size of payload after compression, in bytes
  * LSIZE		logical size of payload, in bytes
  *			note that 25 bits is enough to store the largest
  *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
  * log. birth		transaction group in which the block was logically born
  *
  * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
  * bp's they are stored in units of SPA_MINBLOCKSHIFT.
  * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
  * The B, D, X, lvl, type, and comp fields are stored the same as with normal
  * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
  * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
  * other macros, as they assert that they are only used on BP's of the correct
  * "embedded-ness".
  */
 
 /*
  * Accessors for the fields that only exist in embedded bp's.  Each one
  * asserts BP_IS_EMBEDDED(bp) before touching blk_prop.
  *
  * etype: 8 bits of blk_prop starting at bit 40.
  */
 #define	BPE_GET_ETYPE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BPE_SET_ETYPE(bp, t)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, t); \
 _NOTE(CONSTCOND) } while (0)
 
 /* LSIZE: 25 bits starting at bit 0; no shift, bias 1 (field = size - 1). */
 #define	BPE_GET_LSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
 #define	BPE_SET_LSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /* PSIZE: 7 bits starting at bit 25; no shift, bias 1 (field = size - 1). */
 #define	BPE_GET_PSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
 #define	BPE_SET_PSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /*
  * Values for the etype field of an embedded bp.
  * NUM_BP_EMBEDDED_TYPES is defined equal to BP_EMBEDDED_TYPE_RESERVED,
  * so the reserved value itself is not included in the count.
  */
 typedef enum bp_embedded_type {
 	BP_EMBEDDED_TYPE_DATA,
 	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
 	NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
 } bp_embedded_type_t;
 
 /*
  * Payload capacity of an embedded bp, in 64-bit words and in bytes.
  * BPE_IS_PAYLOADWORD() is true for every word pointer 'wp' except the
  * addresses of blk_prop and blk_birth, the two words that keep their
  * normal meaning in an embedded bp.
  */
 #define	BPE_NUM_WORDS 14
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
 	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 
 /*
  * A block is a hole when it has either 1) never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the
  * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
  * done through the BP_IS_HOLE macro. Holes that were written to at some
  * point (i.e. were punched after having been filled) additionally retain
  * their logical size, level, DMU object type, and birth times.
  */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
 	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
 	uint64_t	blk_birth;	/* transaction group at birth	    */
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
 
 /*
  * Macros to get and set fields in a bp or DVA.
  */
 /*
  * ASIZE: low SPA_ASIZEBITS bits of dva_word[0]; the getter/setter shift
  * by SPA_MINBLOCKSHIFT with a bias of 0.
  */
 #define	DVA_GET_ASIZE(dva)	\
 	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
 /* GRID: 8 bits of dva_word[0] starting at bit 24. */
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
 
 /* vdev ID: upper 32 bits of dva_word[0]. */
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
 #define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, x)
 
 /*
  * Offset: low 63 bits of dva_word[1]; the getter/setter shift by
  * SPA_MINBLOCKSHIFT with a bias of 0.
  */
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_OFFSET(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
 
 /* Gang block indicator: bit 63 of dva_word[1]. */
 #define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ?	\
 	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
 	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_LSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_PSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 7)
 #define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 7, x)
 
 #define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
 #define	BP_GET_CHECKSUM(bp)		\
 	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BP_SET_CHECKSUM(bp, x)		do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
 #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 /*
  * Physical birth txg of a bp: 0 for an embedded bp; otherwise
  * blk_phys_birth when that is non-zero, else blk_birth.
  */
 #define	BP_PHYSICAL_BIRTH(bp)		\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 /*
  * Record both birth txgs on a non-embedded bp.  blk_phys_birth is stored
  * as 0 when the physical txg equals the logical one, which is the
  * encoding BP_PHYSICAL_BIRTH() decodes.
  *
  * Note: this expands to a bare brace block rather than do { } while (0),
  * so "BP_SET_BIRTH(...);" is a block followed by an empty statement and
  * cannot be used as the unbraced body of an if that has an else.
  */
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
 #define	BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
 
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_GET_UCSIZE(bp) \
 	((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
 	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_COUNT_GANG(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[2])))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
 	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
 	(bp1)->blk_birth == (bp2)->blk_birth &&			\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
 
 #define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
 	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
 	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
 	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
 	((zc1).zc_word[3] - (zc2).zc_word[3])))
 
 #define	ZIO_CHECKSUM_IS_ZERO(zc) \
 	(0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
 	(zc)->zc_word[2] | (zc)->zc_word[3]))
 
 #define	ZIO_CHECKSUM_BSWAP(zcp)					\
 {								\
 	(zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);	\
 	(zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);	\
 	(zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);	\
 	(zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);	\
 }
 
 
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
 {						\
 	(zcp)->zc_word[0] = w0;			\
 	(zcp)->zc_word[1] = w1;			\
 	(zcp)->zc_word[2] = w2;			\
 	(zcp)->zc_word[3] = w3;			\
 }
 
 #define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		\
 	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
 #define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
 				(dva)->dva_word[1] == 0ULL)
 #define	BP_IS_HOLE(bp) \
 	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
 	(bp)->blk_dva[1].dva_word[0] = 0;	\
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
 	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
 #if BYTE_ORDER == _BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #else
 #define	ZFS_HOST_BYTEORDER	(1ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
 #define	BP_SPRINTF_LEN	320
 
 /*
  * This macro allows code sharing between zfs, libzpool, and mdb.
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  *
  * The description of 'bp' is written into 'buf' (capacity 'size').
  * 'type', 'checksum' and 'compress' are caller-supplied strings printed
  * with %s.  One of four formats is produced: "<NULL>" for a NULL bp, a
  * HOLE line, an EMBEDDED line, or the full DVA/property/checksum listing.
  *
  * The macro expands to a brace block that declares the locals 'copyname',
  * 'len' and 'copies'.  Every branch accumulates func's return value into
  * 'len', and the closing ASSERT checks that 'len' stayed below 'size'.
  */
 #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
 	int len = 0;							\
 	int copies = 0;							\
 									\
 	if ((bp) == NULL) {						\
 		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
 		len += func(buf + len, size - len,			\
 		    "HOLE [L%llu %s] "					\
 		    "size=%llxL birth=%lluL",				\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)(bp)->blk_birth);			\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
 		    "size=%llxL/%llxP birth=%lluL",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (int)BPE_GET_ETYPE(bp),				\
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
 		    (u_longlong_t)(bp)->blk_birth);			\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &(bp)->blk_dva[d];		\
 			if (DVA_IS_VALID(dva))				\
 				copies++;				\
 			len += func(buf + len, size - len,		\
 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
 			    ws);					\
 		}							\
 		if (BP_IS_GANG(bp) &&					\
 		    DVA_GET_ASIZE(&(bp)->blk_dva[2]) <=			\
 		    DVA_GET_ASIZE(&(bp)->blk_dva[1]) / 2)		\
 			copies--;					\
 		len += func(buf + len, size - len,			\
 		    "[L%llu %s] %s %s %s %s %s %s%c"			\
 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
 		    "cksum=%llx:%llx:%llx:%llx",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    checksum,						\
 		    compress,						\
 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
 		    copyname[copies],					\
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)(bp)->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)(bp)->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)(bp)->blk_cksum.zc_word[1],		\
 		    (u_longlong_t)(bp)->blk_cksum.zc_word[2],		\
 		    (u_longlong_t)(bp)->blk_cksum.zc_word[3]);		\
 	}								\
 	ASSERT(len < size);						\
 }
 
 #include <sys/dmu.h>
 
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
 	ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING,
 	SPA_IMPORT_ASSEMBLE
 } spa_import_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     nvlist_t *zplprops);
 #ifdef illumos
 extern int spa_import_rootpool(char *devpath, char *devid);
 #else
 extern int spa_import_rootpool(const char *name);
 #endif
 extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 
 #define	SPA_ASYNC_CONFIG_UPDATE	0x01
 #define	SPA_ASYNC_REMOVE	0x02
 #define	SPA_ASYNC_PROBE		0x04
 #define	SPA_ASYNC_RESILVER_DONE	0x08
 #define	SPA_ASYNC_RESILVER	0x10
 #define	SPA_ASYNC_AUTOEXPAND	0x20
 #define	SPA_ASYNC_REMOVE_DONE	0x40
 #define	SPA_ASYNC_REMOVE_STOP	0x80
 
 /*
  * Controls the behavior of spa_vdev_remove().
  */
 #define	SPA_REMOVE_UNSPARE	0x01
 #define	SPA_REMOVE_DONE		0x02
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
 /*
  * SPA configuration functions in spa_config.c
  */
 
 #define	SPA_CONFIG_UPDATE_POOL	0
 #define	SPA_CONFIG_UPDATE_VDEVS	1
 
 extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern nvlist_t *spa_all_configs(uint64_t *);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 #define	SCL_NONE	0x00
 #define	SCL_CONFIG	0x01
 #define	SCL_STATE	0x02
 #define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
 #define	SCL_ALLOC	0x08
 #define	SCL_ZIO		0x10
 #define	SCL_FREE	0x20
 #define	SCL_VDEV	0x40
 #define	SCL_LOCKS	7
 #define	SCL_ALL		((1 << SCL_LOCKS) - 1)
 #define	SCL_STATE_ALL	(SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_exit(spa_t *spa, int locks, void *tag);
 extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Log state */
 typedef enum spa_log_state {
 	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
 	SPA_LOG_MISSING,	/* missing log(s) */
 	SPA_LOG_CLEAR,		/* clear the log(s) */
 	SPA_LOG_GOOD,		/* log(s) are good */
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
 extern int spa_offline_log(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern int spa_rename(const char *oldname, const char *newname);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 #define	strtonum(str, nptr)	zfs_strtonum((str), (nptr))
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf);
 extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
 extern void spa_history_log_version(spa_t *spa, const char *operation);
 extern void spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...);
 extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
     dmu_tx_t *tx, const char *fmt, ...);
 extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...);
 
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, zio_t *zio);
 extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
     zio_t *zio, uint64_t stateoroffset, uint64_t length);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
 extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
 extern void vdev_cache_stat_fini(void);
 
 /*
  * Initialization and termination.
  * spa_boot_init() takes no arguments; it is declared with an explicit
  * (void) so that the declaration is a real prototype and calls that pass
  * arguments are diagnosed by the compiler.
  */
 extern void spa_init(int flags);
 extern void spa_fini(void);
 extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
 
 /*
  * dprintf_bp(bp, fmt, ...): debug print followed by a rendering of 'bp'.
  *
  * With ZFS_DEBUG defined, and only when ZFS_DEBUG_DPRINTF is set in
  * zfs_flags, a BP_SPRINTF_LEN scratch buffer is allocated, filled in by
  * snprintf_blkptr(), appended to the message as " %s\n" and then freed.
  * __VA_ARGS__ is followed by a comma, so at least one argument must
  * follow fmt.  Without ZFS_DEBUG the macro expands to nothing.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
 extern boolean_t spa_debug_enabled(spa_t *spa);
 /*
  * Forward the arguments to zfs_dbgmsg() only when spa_debug_enabled(spa)
  * returns true.  Expands to a bare brace block (not do { } while (0)),
  * so "spa_dbgmsg(...);" cannot be the unbraced body of an if that has
  * an else.
  */
 #define	spa_dbgmsg(spa, ...)			\
 {						\
 	if (spa_debug_enabled(spa))		\
 		zfs_dbgmsg(__VA_ARGS__);	\
 }
 
 extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_SPA_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	(revision 296519)
@@ -1,238 +1,239 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
 #define	_SYS_ZAP_IMPL_H
 
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/avl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 extern int fzap_default_block_shift;
 
 #define	ZAP_MAGIC 0x2F52AB2ABULL
 
 #define	FZAP_BLOCK_SHIFT(zap)	((zap)->zap_f.zap_block_shift)
 
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
 #define	MZAP_MAX_BLKSZ		SPA_OLD_MAXBLOCKSIZE
 
 #define	ZAP_NEED_CD		(-1U)
 
 typedef struct mzap_ent_phys {
 	uint64_t mze_value;
 	uint32_t mze_cd;
 	uint16_t mze_pad;	/* in case we want to chain them someday */
 	char mze_name[MZAP_NAME_LEN];
 } mzap_ent_phys_t;
 
 typedef struct mzap_phys {
 	uint64_t mz_block_type;	/* ZBT_MICRO */
 	uint64_t mz_salt;
 	uint64_t mz_normflags;
 	uint64_t mz_pad[5];
 	mzap_ent_phys_t mz_chunk[1];
 	/* actually variable size depending on block size */
 } mzap_phys_t;
 
 typedef struct mzap_ent {
 	avl_node_t mze_node;
 	int mze_chunkid;
 	uint64_t mze_hash;
 	uint32_t mze_cd; /* copy from mze_phys->mze_cd */
 } mzap_ent_t;
 
 #define	MZE_PHYS(zap, mze) \
 	(&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
 
 /*
  * The (fat) zap is stored in one object. It is an array of
  * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
  *
  * ptrtbl fits in first block:
  * 	[zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
  *
  * ptrtbl too big for first block:
  * 	[zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
  *
  */
 
 struct dmu_buf;
 struct zap_leaf;
 
 #define	ZBT_LEAF		((1ULL << 63) + 0)
 #define	ZBT_HEADER		((1ULL << 63) + 1)
 #define	ZBT_MICRO		((1ULL << 63) + 3)
 /* any other values are ptrtbl blocks */
 
 /*
  * the embedded pointer table takes up half a block:
  * block size / entry size (2^3) / 2
  */
 #define	ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
 
 /*
  * The embedded pointer table starts half-way through the block.  Since
  * the pointer table itself is half the block, it starts at (64-bit)
  * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
  */
 #define	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
 	((uint64_t *)zap_f_phys(zap)) \
 	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
 
 /*
  * TAKE NOTE:
  * If zap_phys_t is modified, zap_byteswap() must be modified.
  */
 typedef struct zap_phys {
 	uint64_t zap_block_type;	/* ZBT_HEADER */
 	uint64_t zap_magic;		/* ZAP_MAGIC */
 
 	struct zap_table_phys {
 		uint64_t zt_blk;	/* starting block number */
 		uint64_t zt_numblks;	/* number of blocks */
 		uint64_t zt_shift;	/* bits to index it */
 		uint64_t zt_nextblk;	/* next (larger) copy start block */
 		uint64_t zt_blks_copied; /* number source blocks copied */
 	} zap_ptrtbl;
 
 	uint64_t zap_freeblk;		/* the next free block */
 	uint64_t zap_num_leafs;		/* number of leafs */
 	uint64_t zap_num_entries;	/* number of entries */
 	uint64_t zap_salt;		/* salt to stir into hash function */
 	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
 	uint64_t zap_flags;		/* zap_flags_t */
 	/*
 	 * This structure is followed by padding, and then the embedded
 	 * pointer table.  The embedded pointer table takes up second
 	 * half of the block.  It is accessed using the
 	 * ZAP_EMBEDDED_PTRTBL_ENT() macro.
 	 */
 } zap_phys_t;
 
 typedef struct zap_table_phys zap_table_phys_t;
 
 typedef struct zap {
 	dmu_buf_user_t zap_dbu;
 	objset_t *zap_objset;
 	uint64_t zap_object;
 	struct dmu_buf *zap_dbuf;
 	krwlock_t zap_rwlock;
 	boolean_t zap_ismicro;
 	int zap_normflags;
 	uint64_t zap_salt;
 	union {
 		struct {
 			/*
 			 * zap_num_entries_mtx protects
 			 * zap_num_entries
 			 */
 			kmutex_t zap_num_entries_mtx;
 			int zap_block_shift;
 		} zap_fat;
 		struct {
 			int16_t zap_num_entries;
 			int16_t zap_num_chunks;
 			int16_t zap_alloc_next;
 			avl_tree_t zap_avl;
 		} zap_micro;
 	} zap_u;
 } zap_t;
 
 /*
  * Return the data pointer of the zap's dbuf (zap_dbuf->db_data) typed as
  * the fat-zap header, zap_phys_t.  No check is made here that the zap is
  * actually a fat zap; presumably callers test zap_ismicro first.
  *
  * NOTE(review): declared plain `inline` (neither static nor extern), so
  * under C99 inline rules an external definition must be provided by some
  * .c file -- confirm before changing the storage class.
  */
 inline zap_phys_t *
 zap_f_phys(zap_t *zap)
 {
 	return (zap->zap_dbuf->db_data);
 }
 
 /*
  * Return the data pointer of the zap's dbuf (zap_dbuf->db_data) typed as
  * the micro-zap block, mzap_phys_t.  No check is made here that the zap
  * is actually a micro zap; presumably callers test zap_ismicro first.
  */
 inline mzap_phys_t *
 zap_m_phys(zap_t *zap)
 {
 	return (zap->zap_dbuf->db_data);
 }
 
 typedef struct zap_name {
 	zap_t *zn_zap;
 	int zn_key_intlen;
 	const void *zn_key_orig;
 	int zn_key_orig_numints;
 	const void *zn_key_norm;
 	int zn_key_norm_numints;
 	uint64_t zn_hash;
 	matchtype_t zn_matchtype;
 	char zn_normbuf[ZAP_MAXNAMELEN];
 } zap_name_t;
 
 #define	zap_f	zap_u.zap_fat
 #define	zap_m	zap_u.zap_micro
 
 boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
 void zap_evict(void *dbu);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
 uint32_t zap_maxcd(zap_t *zap);
 uint64_t zap_getflags(zap_t *zap);
 
 /*
  * Index formed from the top n bits of hash.  n == 0 is special-cased to
  * yield 0 because the general form would shift by 64, which is undefined
  * for a 64-bit operand (assumes hash is a 64-bit unsigned value --
  * TODO confirm at call sites).
  */
 #define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
 
 void fzap_byteswap(void *buf, size_t size);
 int fzap_count(zap_t *zap, uint64_t *count);
 int fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
 void fzap_prefetch(zap_name_t *zn);
 int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
     uint64_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
 int fzap_update(zap_name_t *zn,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
 int fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers);
 int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
 int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
 void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
 void zap_put_leaf(struct zap_leaf *l);
 
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_ZAP_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 296519)
@@ -1,451 +1,452 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
 #define	_SYS_ZFS_IOCTL_H
 
 #include <sys/cred.h>
 #include <sys/dmu.h>
 #include <sys/zio.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zfs_stat.h>
 
 #ifdef _KERNEL
 #include <sys/nvpair.h>
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * The structures in this file are passed between userland and the
  * kernel.  Userland may be running a 32-bit process, while the kernel
  * is 64-bit.  Therefore, these structures need to compile the same in
  * 32-bit and 64-bit.  This means not using type "long", and adding
  * explicit padding so that the 32-bit structure will not be packed more
  * tightly than the 64-bit structure (which requires 64-bit alignment).
  */
 
 /*
  * Property values for snapdir
  */
 #define	ZFS_SNAPDIR_HIDDEN		0
 #define	ZFS_SNAPDIR_VISIBLE		1
 
 /*
  * Field manipulation macros for the drr_versioninfo field of the
  * send stream header.
  */
 
 /*
  * Header types for zfs send streams.
  */
 typedef enum drr_headertype {
 	DMU_SUBSTREAM = 0x1,
 	DMU_COMPOUNDSTREAM = 0x2
 } drr_headertype_t;
 
 #define	DMU_GET_STREAM_HDRTYPE(vi)	BF64_GET((vi), 0, 2)
 #define	DMU_SET_STREAM_HDRTYPE(vi, x)	BF64_SET((vi), 0, 2, x)
 
 #define	DMU_GET_FEATUREFLAGS(vi)	BF64_GET((vi), 2, 30)
 #define	DMU_SET_FEATUREFLAGS(vi, x)	BF64_SET((vi), 2, 30, x)
 
 /*
  * Feature flags for zfs send streams (flags in drr_versioninfo)
  */
 
 #define	DMU_BACKUP_FEATURE_DEDUP		(1 << 0)
 #define	DMU_BACKUP_FEATURE_DEDUPPROPS		(1 << 1)
 #define	DMU_BACKUP_FEATURE_SA_SPILL		(1 << 2)
 /* flags #3 - #15 are reserved for incompatible closed-source implementations */
 #define	DMU_BACKUP_FEATURE_EMBED_DATA		(1 << 16)
 #define	DMU_BACKUP_FEATURE_EMBED_DATA_LZ4	(1 << 17)
 /* flag #18 is reserved for a Delphix feature */
 #define	DMU_BACKUP_FEATURE_LARGE_BLOCKS		(1 << 19)
 #define	DMU_BACKUP_FEATURE_RESUMING		(1 << 20)
 
 /*
  * Mask of all supported backup features
  */
 #define	DMU_BACKUP_FEATURE_MASK	(DMU_BACKUP_FEATURE_DEDUP | \
     DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
     DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
     DMU_BACKUP_FEATURE_RESUMING | \
     DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define	DMU_STREAM_SUPPORTED(x)	(!((x) & ~DMU_BACKUP_FEATURE_MASK))
 
 typedef enum dmu_send_resume_token_version {
 	ZFS_SEND_RESUME_TOKEN_VERSION = 1
 } dmu_send_resume_token_version_t;
 
 /*
  * The drr_versioninfo field of the dmu_replay_record has the
  * following layout:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *  	|		reserved	|        feature-flags	    |C|S|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * The low order two bits indicate the header type: SUBSTREAM (0x1)
  * or COMPOUNDSTREAM (0x2).  Using two bits for this is historical:
  * this field used to be a version number, where the two version types
  * were 1 and 2.  Using two bits for this allows earlier versions of
  * the code to be able to recognize send streams that don't use any
  * of the features indicated by feature flags.
  */
 
 #define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
 /*
  * Send stream flags.  Bits 24-31 are reserved for vendor-specific
  * implementations and should not be used.
  */
 #define	DRR_FLAG_CLONE		(1<<0)
 #define	DRR_FLAG_CI_DATA	(1<<1)
 /*
  * This send stream, if it is a full send, includes the FREE and FREEOBJECT
  * records that are created by the sending process.  This means that the send
  * stream can be received as a clone, even though it is not an incremental.
  * This is not implemented as a feature flag, because the receiving side does
  * not need to have implemented it to receive this stream; it is fully backwards
  * compatible.  We need a flag, though, because full send streams without it
  * cannot necessarily be received as a clone correctly.
  */
 #define	DRR_FLAG_FREERECORDS	(1<<2)
 
 /*
  * flags in the drr_checksumflags field in the DRR_WRITE and
  * DRR_WRITE_BYREF blocks
  */
 #define	DRR_CHECKSUM_DEDUP	(1<<0)
 
 #define	DRR_IS_DEDUP_CAPABLE(flags)	((flags) & DRR_CHECKSUM_DEDUP)
 
 /*
  * zfs ioctl command structure
  */
 struct drr_begin {
 	uint64_t drr_magic;
 	uint64_t drr_versioninfo; /* was drr_version */
 	uint64_t drr_creation_time;
 	dmu_objset_type_t drr_type;
 	uint32_t drr_flags;
 	uint64_t drr_toguid;
 	uint64_t drr_fromguid;
 	char drr_toname[MAXNAMELEN];
 };
 
 struct drr_end {
 	zio_cksum_t drr_checksum;
 	uint64_t drr_toguid;
 };
 
 struct drr_object {
 	uint64_t drr_object;
 	dmu_object_type_t drr_type;
 	dmu_object_type_t drr_bonustype;
 	uint32_t drr_blksz;
 	uint32_t drr_bonuslen;
 	uint8_t drr_checksumtype;
 	uint8_t drr_compress;
 	uint8_t drr_pad[6];
 	uint64_t drr_toguid;
 	/* bonus content follows */
 };
 
 struct drr_freeobjects {
 	uint64_t drr_firstobj;
 	uint64_t drr_numobjs;
 	uint64_t drr_toguid;
 };
 
 struct drr_write {
 	uint64_t drr_object;
 	dmu_object_type_t drr_type;
 	uint32_t drr_pad;
 	uint64_t drr_offset;
 	uint64_t drr_length;
 	uint64_t drr_toguid;
 	uint8_t drr_checksumtype;
 	uint8_t drr_checksumflags;
 	uint8_t drr_pad2[6];
 	ddt_key_t drr_key; /* deduplication key */
 	/* content follows */
 };
 
 struct drr_free {
 	uint64_t drr_object;
 	uint64_t drr_offset;
 	uint64_t drr_length;
 	uint64_t drr_toguid;
 };
 
 struct drr_write_byref {
 	/* where to put the data */
 	uint64_t drr_object;
 	uint64_t drr_offset;
 	uint64_t drr_length;
 	uint64_t drr_toguid;
 	/* where to find the prior copy of the data */
 	uint64_t drr_refguid;
 	uint64_t drr_refobject;
 	uint64_t drr_refoffset;
 	/* properties of the data */
 	uint8_t drr_checksumtype;
 	uint8_t drr_checksumflags;
 	uint8_t drr_pad2[6];
 	ddt_key_t drr_key; /* deduplication key */
 };
 
 struct drr_spill {
 	uint64_t drr_object;
 	uint64_t drr_length;
 	uint64_t drr_toguid;
 	uint64_t drr_pad[4]; /* needed for crypto */
 	/* spill data follows */
 };
 
 typedef struct dmu_replay_record {
 	enum {
 		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
 		DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
 		DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
 	} drr_type;
 	uint32_t drr_payloadlen;
 	union {
 		struct drr_begin drr_begin;
 		struct drr_end drr_end;
 		struct drr_object drr_object;
 		struct drr_freeobjects drr_freeobjects;
 		struct drr_write drr_write;
 		struct drr_free drr_free;
 		struct drr_write_byref drr_write_byref;
 		struct drr_spill drr_spill;
 		struct drr_write_embedded {
 			uint64_t drr_object;
 			uint64_t drr_offset;
 			/* logical length, should equal blocksize */
 			uint64_t drr_length;
 			uint64_t drr_toguid;
 			uint8_t drr_compression;
 			uint8_t drr_etype;
 			uint8_t drr_pad[6];
 			uint32_t drr_lsize; /* uncompressed size of payload */
 			uint32_t drr_psize; /* compr. (real) size of payload */
 			/* (possibly compressed) content follows */
 		} drr_write_embedded;
 
 		/*
 		 * Note: drr_checksum is overlaid with all record types
 		 * except DRR_BEGIN.  Therefore its (non-pad) members
 		 * must not overlap with members from the other structs.
 		 * We accomplish this by putting its members at the very
 		 * end of the struct.
 		 */
 		struct drr_checksum {
 			uint64_t drr_pad[34];
 			/*
 			 * fletcher-4 checksum of everything preceding the
 			 * checksum.
 			 */
 			zio_cksum_t drr_checksum;
 		} drr_checksum;
 	} drr_u;
 } dmu_replay_record_t;
 
 /* diff record range types */
 typedef enum diff_type {
 	DDR_NONE = 0x1,
 	DDR_INUSE = 0x2,
 	DDR_FREE = 0x4
 } diff_type_t;
 
 /*
  * The diff reports back ranges of free or in-use objects.
  */
 typedef struct dmu_diff_record {
 	uint64_t ddr_type;
 	uint64_t ddr_first;
 	uint64_t ddr_last;
 } dmu_diff_record_t;
 
 typedef struct zinject_record {
 	uint64_t	zi_objset;
 	uint64_t	zi_object;
 	uint64_t	zi_start;
 	uint64_t	zi_end;
 	uint64_t	zi_guid;
 	uint32_t	zi_level;
 	uint32_t	zi_error;
 	uint64_t	zi_type;
 	uint32_t	zi_freq;
 	uint32_t	zi_failfast;
 	char		zi_func[MAXNAMELEN];
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
 	uint64_t	zi_nlanes;
 	uint32_t	zi_cmd;
 	uint32_t	zi_pad;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
 typedef enum zinject_type {
 	ZINJECT_UNINITIALIZED,
 	ZINJECT_DATA_FAULT,
 	ZINJECT_DEVICE_FAULT,
 	ZINJECT_LABEL_FAULT,
 	ZINJECT_IGNORED_WRITES,
 	ZINJECT_PANIC,
 	ZINJECT_DELAY_IO,
 } zinject_type_t;
 
 typedef struct zfs_share {
 	uint64_t	z_exportdata;
 	uint64_t	z_sharedata;
 	uint64_t	z_sharetype;	/* 0 = share, 1 = unshare */
 	uint64_t	z_sharemax;  /* max length of share string */
 } zfs_share_t;
 
 /*
  * ZFS file systems may behave the usual, POSIX-compliant way, where
  * name lookups are case-sensitive.  They may also be set up so that
  * all the name lookups are case-insensitive, or so that only some
  * lookups, the ones that set an FIGNORECASE flag, are case-insensitive.
  */
 typedef enum zfs_case {
 	ZFS_CASE_SENSITIVE,
 	ZFS_CASE_INSENSITIVE,
 	ZFS_CASE_MIXED
 } zfs_case_t;
 
 typedef struct zfs_cmd {
 	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
 	uint64_t	zc_nvlist_src;		/* really (char *) */
 	uint64_t	zc_nvlist_src_size;
 	uint64_t	zc_nvlist_dst;		/* really (char *) */
 	uint64_t	zc_nvlist_dst_size;
 	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
 	int		zc_pad2;
 
 	/*
 	 * The following members are for legacy ioctls which haven't been
 	 * converted to the new method.
 	 */
 	uint64_t	zc_history;		/* really (char *) */
 	char		zc_value[MAXPATHLEN * 2];
 	char		zc_string[MAXNAMELEN];
 	uint64_t	zc_guid;
 	uint64_t	zc_nvlist_conf;		/* really (char *) */
 	uint64_t	zc_nvlist_conf_size;
 	uint64_t	zc_cookie;
 	uint64_t	zc_objset_type;
 	uint64_t	zc_perm_action;
 	uint64_t	zc_history_len;
 	uint64_t	zc_history_offset;
 	uint64_t	zc_obj;
 	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
 	zfs_share_t	zc_share;
 	uint64_t	zc_jailid;
 	dmu_objset_stats_t zc_objset_stats;
 	dmu_replay_record_t zc_begin_record;
 	zinject_record_t zc_inject_record;
 	uint32_t	zc_defer_destroy;
 	uint32_t	zc_flags;
 	uint64_t	zc_action_handle;
 	int		zc_cleanup_fd;
 	uint8_t		zc_simple;
 	boolean_t	zc_resumable;
 	uint64_t	zc_sendobj;
 	uint64_t	zc_fromobj;
 	uint64_t	zc_createtxg;
 	zfs_stat_t	zc_stat;
 } zfs_cmd_t;
 
 typedef struct zfs_useracct {
 	char zu_domain[256];
 	uid_t zu_rid;
 	uint32_t zu_pad;
 	uint64_t zu_space;
 } zfs_useracct_t;
 
 #define	ZFSDEV_MAX_MINOR	(1 << 16)
 #define	ZFS_MIN_MINOR	(ZFSDEV_MAX_MINOR + 1)
 
 #define	ZPOOL_EXPORT_AFTER_SPLIT 0x1
 
 #ifdef _KERNEL
 
 typedef struct zfs_creat {
 	nvlist_t	*zct_zplprops;
 	nvlist_t	*zct_props;
 } zfs_creat_t;
 
 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from,
     const char *to, cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int zfs_busy(void);
 extern int zfs_unmount_snap(const char *);
 extern void zfs_destroy_unmount_origin(const char *);
 
 /*
  * ZFS minor numbers can refer to either a control device instance or
  * a zvol. Depending on the value of zss_type, zss_data points to either
  * a zvol_state_t or a zfs_onexit_t.
  */
 enum zfs_soft_state_type {
 	ZSST_ZVOL,
 	ZSST_CTLDEV
 };
 
 typedef struct zfs_soft_state {
 	enum zfs_soft_state_type zss_type;
 	void *zss_data;
 } zfs_soft_state_t;
 
 extern void *zfsdev_get_soft_state(minor_t minor,
     enum zfs_soft_state_type which);
 extern minor_t zfsdev_minor_alloc(void);
 
 extern void *zfsdev_state;
 
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZFS_IOCTL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h	(revision 296519)
@@ -1,374 +1,375 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_FS_ZFS_ZNODE_H
 #define	_SYS_FS_ZFS_ZNODE_H
 
 #ifdef _KERNEL
 #include <sys/list.h>
 #include <sys/dmu.h>
 #include <sys/sa.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/rrwlock.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_stat.h>
 #endif
 #include <sys/zfs_acl.h>
 #include <sys/zil.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Additional file level attributes, that are stored
  * in the upper half of zp_flags
  */
 #define	ZFS_READONLY		0x0000000100000000
 #define	ZFS_HIDDEN		0x0000000200000000
 #define	ZFS_SYSTEM		0x0000000400000000
 #define	ZFS_ARCHIVE		0x0000000800000000
 #define	ZFS_IMMUTABLE		0x0000001000000000
 #define	ZFS_NOUNLINK		0x0000002000000000
 #define	ZFS_APPENDONLY		0x0000004000000000
 #define	ZFS_NODUMP		0x0000008000000000
 #define	ZFS_OPAQUE		0x0000010000000000
 #define	ZFS_AV_QUARANTINED 	0x0000020000000000
 #define	ZFS_AV_MODIFIED 	0x0000040000000000
 #define	ZFS_REPARSE		0x0000080000000000
 #define	ZFS_OFFLINE		0x0000100000000000
 #define	ZFS_SPARSE		0x0000200000000000
 
 /*
  * Set (value nonzero) or clear (value zero) the bit(s) attr in pflags,
  * then write pflags back with sa_update() on zp's SA handle using the
  * SA_ZPL_FLAGS attribute; VERIFY trips if sa_update() returns nonzero.
  *
  * pflags must be a modifiable lvalue: it is assigned to and its address
  * is taken.  Every macro argument is parenthesized so that a compound
  * attr such as (A | B) is complemented as a whole when clearing.
  */
 #define	ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
 { \
 	if (value) \
 		(pflags) |= (attr); \
 	else \
 		(pflags) &= ~(attr); \
 	VERIFY(0 == sa_update((zp)->z_sa_hdl, SA_ZPL_FLAGS((zp)->z_zfsvfs), \
 	    &(pflags), sizeof (pflags), tx)); \
 }
 
 /*
  * Define special zfs pflags
  */
 #define	ZFS_XATTR		0x1		/* is an extended attribute */
 #define	ZFS_INHERIT_ACE		0x2		/* ace has inheritable ACEs */
 #define	ZFS_ACL_TRIVIAL 	0x4		/* files ACL is trivial */
 #define	ZFS_ACL_OBJ_ACE 	0x8		/* ACL has CMPLX Object ACE */
 #define	ZFS_ACL_PROTECTED	0x10		/* ACL protected */
 #define	ZFS_ACL_DEFAULTED	0x20		/* ACL should be defaulted */
 #define	ZFS_ACL_AUTO_INHERIT	0x40		/* ACL should be inherited */
 #define	ZFS_BONUS_SCANSTAMP	0x80		/* Scanstamp in bonus area */
 #define	ZFS_NO_EXECS_DENIED	0x100		/* exec was given to everyone */
 
 /*
  * Look up one entry of z->z_attr_table by its ZPL_* index.  The argument
  * is parenthesized so any pointer expression (e.g. &fs) may be passed.
  */
 #define	SA_ZPL_ATIME(z)		(z)->z_attr_table[ZPL_ATIME]
 #define	SA_ZPL_MTIME(z)		(z)->z_attr_table[ZPL_MTIME]
 #define	SA_ZPL_CTIME(z)		(z)->z_attr_table[ZPL_CTIME]
 #define	SA_ZPL_CRTIME(z)	(z)->z_attr_table[ZPL_CRTIME]
 #define	SA_ZPL_GEN(z)		(z)->z_attr_table[ZPL_GEN]
 #define	SA_ZPL_DACL_ACES(z)	(z)->z_attr_table[ZPL_DACL_ACES]
 #define	SA_ZPL_XATTR(z)		(z)->z_attr_table[ZPL_XATTR]
 #define	SA_ZPL_SYMLINK(z)	(z)->z_attr_table[ZPL_SYMLINK]
 #define	SA_ZPL_RDEV(z)		(z)->z_attr_table[ZPL_RDEV]
 #define	SA_ZPL_SCANSTAMP(z)	(z)->z_attr_table[ZPL_SCANSTAMP]
 #define	SA_ZPL_UID(z)		(z)->z_attr_table[ZPL_UID]
 #define	SA_ZPL_GID(z)		(z)->z_attr_table[ZPL_GID]
 #define	SA_ZPL_PARENT(z)	(z)->z_attr_table[ZPL_PARENT]
 #define	SA_ZPL_LINKS(z)		(z)->z_attr_table[ZPL_LINKS]
 #define	SA_ZPL_MODE(z)		(z)->z_attr_table[ZPL_MODE]
 #define	SA_ZPL_DACL_COUNT(z)	(z)->z_attr_table[ZPL_DACL_COUNT]
 #define	SA_ZPL_FLAGS(z)		(z)->z_attr_table[ZPL_FLAGS]
 #define	SA_ZPL_SIZE(z)		(z)->z_attr_table[ZPL_SIZE]
 #define	SA_ZPL_ZNODE_ACL(z)	(z)->z_attr_table[ZPL_ZNODE_ACL]
 #define	SA_ZPL_PAD(z)		(z)->z_attr_table[ZPL_PAD]
 
 /*
  * Is ID ephemeral?  True when x compares greater than MAXUID.  The
  * argument is parenthesized so low-precedence expressions (e.g. a
  * conditional) are compared as a whole.
  */
 #define	IS_EPHEMERAL(x)		((x) > MAXUID)
 
 /*
  * Should we use FUIDs?  True when both the given ZPL version and the
  * SPA version of the objset's pool are at least the FUID versions.
  * USE_SA is the same test against the SA versions.  The version argument
  * is parenthesized so it is compared as a whole expression.
  */
 #define	USE_FUIDS(version, os)	((version) >= ZPL_VERSION_FUID && \
     spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
 #define	USE_SA(version, os) ((version) >= ZPL_VERSION_SA && \
     spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
 
 #define	MASTER_NODE_OBJ	1
 
 /*
  * Special attributes for master node.
  * "userquota@" and "groupquota@" are also valid (from
  * zfs_userquota_prop_prefixes[]).
  */
 #define	ZFS_FSID		"FSID"
 #define	ZFS_UNLINKED_SET	"DELETE_QUEUE"
 #define	ZFS_ROOT_OBJ		"ROOT"
 #define	ZPL_VERSION_STR		"VERSION"
 #define	ZFS_FUID_TABLES		"FUID"
 #define	ZFS_SHARES_DIR		"SHARES"
 #define	ZFS_SA_ATTRS		"SA_ATTRS"
 
 /*
  * Path component length
  *
  * The generic fs code uses MAXNAMELEN to represent
  * what the largest component length is.  Unfortunately,
  * this length includes the terminating NUL.  ZFS needs
  * to tell the users via pathconf() and statvfs() what the
  * true maximum length of a component is, excluding the NUL.
  */
 #define	ZFS_MAXNAMELEN	(MAXNAMELEN - 1)
 
 /*
  * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
  * the directory entries.
  */
 #ifndef IFTODT
 #define	IFTODT(mode) (((mode) & S_IFMT) >> 12)
 #endif
 
 /*
  * The directory entry has the type (currently unused on Solaris) in the
  * top 4 bits, and the object number in the low 48 bits.  The "middle"
  * 12 bits are unused.
  */
 #define	ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
 #define	ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
 
 /*
  * Directory entry locks control access to directory entries.
  * They are used to protect creates, deletes, and renames.
  * Each directory znode has a mutex and a list of locked names.
  */
 #ifdef _KERNEL
 typedef struct zfs_dirlock {
 	char		*dl_name;	/* directory entry being locked */
 	uint32_t	dl_sharecnt;	/* 0 if exclusive, > 0 if shared */
 	uint8_t		dl_namelock;	/* 1 if z_name_lock is NOT held */
 	uint16_t	dl_namesize;	/* set if dl_name was allocated */
 	kcondvar_t	dl_cv;		/* wait for entry to be unlocked */
 	struct znode	*dl_dzp;	/* directory znode */
 	struct zfs_dirlock *dl_next;	/* next in z_dirlocks list */
 } zfs_dirlock_t;
 
 typedef struct znode {
 	struct zfsvfs	*z_zfsvfs;
 	vnode_t		*z_vnode;
 	uint64_t	z_id;		/* object ID for this znode */
 	kmutex_t	z_lock;		/* znode modification lock */
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
 	kmutex_t	z_range_lock;	/* protects changes to z_range_avl */
 	avl_tree_t	z_range_avl;	/* avl tree of file range locks */
 	uint8_t		z_unlinked;	/* file has been unlinked */
 	uint8_t		z_atime_dirty;	/* atime needs to be synced */
 	uint8_t		z_zn_prefetch;	/* Prefetch znodes? */
 	uint8_t		z_moved;	/* Has this znode been moved? */
 	uint_t		z_blksz;	/* block size in bytes */
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
 	uint64_t	z_gen;		/* generation (cached) */
 	uint64_t	z_size;		/* file size (cached) */
 	uint64_t	z_atime[2];	/* atime (cached) */
 	uint64_t	z_links;	/* file links (cached) */
 	uint64_t	z_pflags;	/* pflags (cached) */
 	uint64_t	z_uid;		/* uid fuid (cached) */
 	uint64_t	z_gid;		/* gid fuid (cached) */
 	mode_t		z_mode;		/* mode (cached) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	zfs_acl_t	*z_acl_cached;	/* cached acl */
 	list_node_t	z_link_node;	/* all znodes in fs link */
 	sa_handle_t	*z_sa_hdl;	/* handle to sa data */
 	boolean_t	z_is_sa;	/* are we native sa? */
 } znode_t;
 
 
 /*
  * Range locking rules
  * --------------------
  * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
  *    file range needs to be locked as RL_WRITER. Only then can the pages be
  *    freed etc and zp_size reset. zp_size must be set within range lock.
  * 2. For writes and punching holes (zfs_write & zfs_space) just the range
  *    being written or freed needs to be locked as RL_WRITER.
  *    Multiple writes at the end of the file must coordinate zp_size updates
  *    to ensure data isn't lost. A compare and swap loop is currently used
  *    to ensure the file size is at least the offset last written.
  * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
  *    read needs to be locked as RL_READER. A check against zp_size can then
  *    be made for reading beyond end of file.
  */
 
 /*
  * Convert between znode pointers and vnode pointers
  */
 #ifdef DEBUG
 static __inline vnode_t *
 ZTOV(znode_t *zp)
 {
 	vnode_t *vp = zp->z_vnode;
 
 	ASSERT(vp == NULL || vp->v_data == NULL || vp->v_data == zp);
 	return (vp);
 }
 static __inline znode_t *
 VTOZ(vnode_t *vp)
 {
 	znode_t *zp = (znode_t *)vp->v_data;
 
 	ASSERT(zp == NULL || zp->z_vnode == NULL || zp->z_vnode == vp);
 	return (zp);
 }
 #else
 #define	ZTOV(ZP)	((ZP)->z_vnode)
 #define	VTOZ(VP)	((znode_t *)(VP)->v_data)
 #endif
 
 /*
  * Called on entry to each ZFS vnode and vfs operation.
  *
  * Calls rrm_enter_read() on the file system's z_teardown_lock.  If
  * z_unmounted is set, the lock is released again via ZFS_EXIT() and the
  * *enclosing function* returns EIO, so this may only be used in functions
  * returning an int error code.  Wrapped in do { } while (0) so that it is
  * a single statement; invoke it with a trailing semicolon.
  */
 #define	ZFS_ENTER(zfsvfs) \
 	do { \
 		rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
 		if ((zfsvfs)->z_unmounted) { \
 			ZFS_EXIT(zfsvfs); \
 			return (EIO); \
 		} \
 	} while (0)
 
 /* Must be called before exiting the vop; pairs with ZFS_ENTER(). */
 #define	ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
 
 /*
  * Verifies the znode is valid: if zp has no SA handle (z_sa_hdl is
  * NULL), ZFS_EXIT() is invoked on its zfsvfs and the *enclosing function*
  * returns EIO.  Wrapped in do { } while (0) so a following `else` cannot
  * bind to the macro's internal `if`; invoke it with a trailing semicolon.
  */
 #define	ZFS_VERIFY_ZP(zp) \
 	do { \
 		if ((zp)->z_sa_hdl == NULL) { \
 			ZFS_EXIT((zp)->z_zfsvfs); \
 			return (EIO); \
 		} \
 	} while (0)
 
 /*
  * Macros for dealing with dmu_buf_hold
  */
 #define	ZFS_OBJ_HASH(obj_num)	((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
 #define	ZFS_OBJ_MUTEX(zfsvfs, obj_num)	\
 	(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
 #define	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
 	mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
 #define	ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \
 	mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
 #define	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
 	mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
 
 /* Encode ZFS stored time values from a struct timespec */
 #define	ZFS_TIME_ENCODE(tp, stmp)		\
 {						\
 	(stmp)[0] = (uint64_t)(tp)->tv_sec;	\
 	(stmp)[1] = (uint64_t)(tp)->tv_nsec;	\
 }
 
 /* Decode ZFS stored time values to a struct timespec */
 #define	ZFS_TIME_DECODE(tp, stmp)		\
 {						\
 	(tp)->tv_sec = (time_t)(stmp)[0];		\
 	(tp)->tv_nsec = (long)(stmp)[1];		\
 }
 
 /*
  * Timestamp defines
  */
 #define	ACCESSED		(AT_ATIME)
 #define	STATE_CHANGED		(AT_CTIME)
 #define	CONTENT_MODIFIED	(AT_MTIME | AT_CTIME)
 
 /*
  * Call zfs_tstamp_update_setup(zp, ACCESSED, ...) when z_atime is set on
  * the file system and its vfs is not flagged VFS_RDONLY.  Wrapped in
  * do { } while (0) (and no longer carrying its own `;`) so it is a single
  * statement; invoke it with a trailing semicolon.
  */
 #define	ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
 	do { \
 		if ((zfsvfs)->z_atime && \
 		    !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
 			zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, \
 			    B_FALSE); \
 	} while (0)
 
 extern int	zfs_init_fs(zfsvfs_t *, znode_t **);
 extern void	zfs_set_dataprop(objset_t *);
 extern void	zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
     dmu_tx_t *tx);
 extern void	zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
     uint64_t [2], boolean_t);
 extern void	zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
 extern int	zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
 extern void	zfs_znode_init(void);
 extern void	zfs_znode_fini(void);
 extern int	zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
 extern int	zfs_rezget(znode_t *);
 extern void	zfs_zinactive(znode_t *);
 extern void	zfs_znode_delete(znode_t *, dmu_tx_t *);
 extern void	zfs_znode_free(znode_t *);
 /* Both take no arguments; (void) makes these real prototypes. */
 extern void	zfs_remove_op_tables(void);
 extern int	zfs_create_op_tables(void);
 extern dev_t	zfs_cmpldev(uint64_t);
 extern int	zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
 extern int	zfs_get_stats(objset_t *os, nvlist_t *nv);
 extern void	zfs_znode_dmu_fini(znode_t *);
 
 extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *,
     vattr_t *vap);
 extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
     vattr_t *vap);
 extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, char *name, uint64_t foid);
 #define	ZFS_NO_OBJECT	0	/* no object id */
 extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name);
 extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, char *link);
 extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
 extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t len, int ioflag);
 extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, uint64_t off, uint64_t len);
 extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
 #ifndef ZFS_NO_ACL
 extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
 #endif
 extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 
 extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
 extern int zfsfstype;
 
 #endif /* _KERNEL */
 
 extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_FS_ZFS_ZNODE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	(revision 296519)
@@ -1,433 +1,434 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_ZIL_H
 #define	_SYS_ZIL_H
 
 #include <sys/types.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_pool;
 struct dsl_dataset;
 
 /*
  * Intent log format:
  *
  * Each objset has its own intent log.  The log header (zil_header_t)
  * for objset N's intent log is kept in the Nth object of the SPA's
  * intent_log objset.  The log header points to a chain of log blocks,
  * each of which contains log records (i.e., transactions) followed by
  * a log block trailer (zil_trailer_t).  The format of a log record
  * depends on the record (or transaction) type, but all records begin
  * with a common structure that defines the type, length, and txg.
  */
 
 /*
  * Intent log header - this on disk structure holds fields to manage
  * the log.  All fields are 64 bit to easily handle cross architectures.
  */
 typedef struct zil_header {
 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
 	blkptr_t zh_log;	/* log chain */
 	uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
 	/* bit flags; see the ZIL_REPLAY_NEEDED / ZIL_CLAIM_LR_SEQ_VALID bits */
 	uint64_t zh_flags;	/* header flags */
 	/* meaningful only when ZIL_CLAIM_LR_SEQ_VALID is set in zh_flags */
 	uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
 	/* NOTE(review): presumably reserved space in the header - confirm */
 	uint64_t zh_pad[3];
 } zil_header_t;
 
 /*
  * zh_flags bit settings
  */
 #define	ZIL_REPLAY_NEEDED	0x1	/* replay needed - internal only */
 #define	ZIL_CLAIM_LR_SEQ_VALID	0x2	/* zh_claim_lr_seq field is valid */
 
 /*
  * Log block chaining.
  *
  * Log blocks are chained together. Originally they were chained at the
  * end of the block. For performance reasons the chain was moved to the
  * beginning of the block which allows writes for only the data being used.
  * The older position is supported for backwards compatibility.
  *
  * The zio_eck_t contains a zec_cksum which for the intent log is
  * the sequence number of this log block. A seq of 0 is invalid.
  * The zec_cksum is checked by the SPA against the sequence
  * number passed in the blk_cksum field of the blkptr_t
  */
 typedef struct zil_chain {
 	uint64_t zc_pad;
 	blkptr_t zc_next_blk;	/* next block in chain */
 	uint64_t zc_nused;	/* bytes in log block used */
 	zio_eck_t zc_eck;	/* block trailer */
 } zil_chain_t;
 
 #define	ZIL_MIN_BLKSZ	4096ULL
 
 /*
  * The words of a log block checksum.
  */
 #define	ZIL_ZC_GUID_0	0
 #define	ZIL_ZC_GUID_1	1
 #define	ZIL_ZC_OBJSET	2
 #define	ZIL_ZC_SEQ	3
 
 /*
  * Object kind selector; this is the type of the first argument of
  * zfs_log_create_txtype().
  * NOTE(review): judging by the enumerator names these stand for a regular
  * file, a directory and an extended-attribute directory - confirm.
  */
 typedef enum zil_create {
 	Z_FILE,
 	Z_DIR,
 	Z_XATTRDIR,
 } zil_create_t;
 
 /*
  * Size of the xvattr log section.
  * It is composed of an lr_attr_t (which already holds the first bitmap
  * word), the remaining (mapsize - 1) 32 bit bitmap words, and seven
  * 64 bit integers: 2 for the create time, a single one for all of the
  * attributes, and 4 (32 bytes) for the scanstamp.
  *
  * Both the argument and the whole expansion are parenthesized so that the
  * macro yields the intended value when it is given a compound expression
  * or is itself used inside a larger expression.
  */
 
 #define	ZIL_XVAT_SIZE(mapsize) \
 	(sizeof (lr_attr_t) + (sizeof (uint32_t) * ((mapsize) - 1)) + \
 	(sizeof (uint64_t) * 7))
 
 /*
  * Size of ACL in log.  The ACE data is padded out to properly align
  * on 8 byte boundary.
  */
 
 #define	ZIL_ACE_LENGTH(x)	(roundup(x, sizeof (uint64_t)))
 
 /*
  * Intent log transaction types and record structures
  */
 #define	TX_CREATE		1	/* Create file */
 #define	TX_MKDIR		2	/* Make directory */
 #define	TX_MKXATTR		3	/* Make XATTR directory */
 #define	TX_SYMLINK		4	/* Create symbolic link to a file */
 #define	TX_REMOVE		5	/* Remove file */
 #define	TX_RMDIR		6	/* Remove directory */
 #define	TX_LINK			7	/* Create hard link to a file */
 #define	TX_RENAME		8	/* Rename a file */
 #define	TX_WRITE		9	/* File write */
 #define	TX_TRUNCATE		10	/* Truncate a file */
 #define	TX_SETATTR		11	/* Set file attributes */
 #define	TX_ACL_V0		12	/* Set old formatted ACL */
 #define	TX_ACL			13	/* Set ACL */
 #define	TX_CREATE_ACL		14	/* create with ACL */
 #define	TX_CREATE_ATTR		15	/* create + attrs */
 #define	TX_CREATE_ACL_ATTR 	16	/* create with ACL + attrs */
 #define	TX_MKDIR_ACL		17	/* mkdir with ACL */
 #define	TX_MKDIR_ATTR		18	/* mkdir with attr */
 #define	TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
 #define	TX_WRITE2		20	/* dmu_sync EALREADY write */
 #define	TX_MAX_TYPE		21	/* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
  * may have the following bit set, indicating the original request
  * specified case-insensitive handling of names.
  */
 #define	TX_CI	((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
 
 /*
  * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
  * out of order.  For convenience in the code, all such records must have
  * lr_foid at the same offset.
  *
  * TX_OOO(txtype) is non-zero when txtype is one of those types (TX_WRITE2
  * is included as well).  The argument is evaluated once per comparison,
  * so it must not have side effects.
  */
 #define	TX_OOO(txtype)			\
 	((txtype) == TX_WRITE ||	\
 	(txtype) == TX_TRUNCATE ||	\
 	(txtype) == TX_SETATTR ||	\
 	(txtype) == TX_ACL_V0 ||	\
 	(txtype) == TX_ACL ||		\
 	(txtype) == TX_WRITE2)
 
 /*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
  * Each log record has a common structure at the beginning.
  *
  * The log record on disk (lrc_seq) holds the sequence number of all log
  * records which is used to ensure we don't replay the same record.
  */
 typedef struct {			/* common log record header */
 	uint64_t	lrc_txtype;	/* intent log transaction type */
 	uint64_t	lrc_reclen;	/* transaction record length */
 	uint64_t	lrc_txg;	/* dmu transaction group number */
 	uint64_t	lrc_seq;	/* see comment above */
 } lr_t;
 
 /*
  * Common start of all out-of-order record types (TX_OOO() above).
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* object id */
 } lr_ooo_t;
 
 /*
  * Handle optional extended vattr attributes.
  *
  * Whenever new attributes are added the version number
  * will need to be updated as will code in
  * zfs_log.c and zfs_replay.c
  */
 typedef struct {
 	uint32_t	lr_attr_masksize; /* number of elements in array */
 	uint32_t	lr_attr_bitmap; /* First entry of array */
 	/* remainder of array and any additional fields */
 } lr_attr_t;
 
 /*
  * log record for creates without optional ACL.
  * This log record does support optional xvattr_t attributes.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* object id of directory */
 	uint64_t	lr_foid;	/* object id of created file object */
 	uint64_t	lr_mode;	/* mode of object */
 	uint64_t	lr_uid;		/* uid of object */
 	uint64_t	lr_gid;		/* gid of object */
 	uint64_t	lr_gen;		/* generation (txg of creation) */
 	uint64_t	lr_crtime[2];	/* creation time */
 	uint64_t	lr_rdev;	/* rdev of object to create */
 	/* name of object to create follows this */
 	/* for symlinks, link content follows name */
 	/* for creates with xvattr data, the name follows the xvattr info */
 } lr_create_t;
 
 /*
  * FUID ACL record will be an array of ACEs from the original ACL.
  * If this array includes ephemeral IDs, the record will also include
  * an array of log-specific FUIDs to replace the ephemeral IDs.
  * Only one copy of each unique domain will be present, so the log-specific
  * FUIDs will use an index into a compressed domain table.  On replay this
  * information will be used to construct real FUIDs (and bypass idmap,
  * since it may not be available).
  */
 
 /*
  * Log record for creates with optional ACL
  * This log record is also used for recording any FUID
  * information needed for replaying the create.  If the
  * file doesn't have any actual ACEs then the lr_aclcnt
  * would be zero.
  *
  * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
  * If create is also setting xvattr's, then acl data follows xvattr.
  * If ACE FUIDs are needed then they will follow the xvattr_t.  Following
  * the FUIDs will be the domain table information.  The FUIDs for the owner
  * and group will be in lr_create.  Name follows ACL data.
  */
 typedef struct {
 	lr_create_t	lr_create;	/* common create portion */
 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
 	uint64_t	lr_domcnt;	/* number of unique domains */
 	uint64_t	lr_fuidcnt;	/* number of real fuids */
 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
 	uint64_t	lr_acl_flags;	/* ACL flags */
 } lr_acl_create_t;
 
 /*
  * Log record for removing a named object from a directory; the name is
  * stored immediately after the fixed-size part.
  * NOTE(review): presumably used for both TX_REMOVE and TX_RMDIR - confirm.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* obj id of directory */
 	/* name of object to remove follows this */
 } lr_remove_t;
 
 /*
  * Log record for linking an object into a directory; the link name is
  * stored immediately after the fixed-size part.
  * NOTE(review): presumably the record used for TX_LINK - confirm.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* obj id of directory */
 	uint64_t	lr_link_obj;	/* obj id of link */
 	/* name of object to link follows this */
 } lr_link_t;
 
 /*
  * Log record for a rename between a source and a target directory; the
  * source and destination names are stored after the fixed-size part.
  * NOTE(review): presumably the record used for TX_RENAME - confirm.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_sdoid;	/* obj id of source directory */
 	uint64_t	lr_tdoid;	/* obj id of target directory */
 	/* 2 strings: names of source and destination follow this */
 } lr_rename_t;
 
 /*
  * Log record for a file write.  lr_foid directly follows lr_common, which
  * is the layout lr_ooo_t describes for the out-of-order record types.
  * The record carries either a block pointer (lr_blkptr) or, for small
  * writes, the data itself after the fixed-size part.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* file object to write */
 	uint64_t	lr_offset;	/* offset to write to */
 	uint64_t	lr_length;	/* user data length to write */
 	uint64_t	lr_blkoff;	/* no longer used */
 	blkptr_t	lr_blkptr;	/* spa block pointer for replay */
 	/* write data will follow for small writes */
 } lr_write_t;
 
 /*
  * Log record for truncating a file.  lr_foid directly follows lr_common,
  * which is the layout lr_ooo_t describes for the out-of-order record types.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* object id of file to truncate */
 	uint64_t	lr_offset;	/* offset to truncate from */
 	uint64_t	lr_length;	/* length to truncate */
 } lr_truncate_t;
 
 /*
  * Log record for setting file attributes.  lr_foid directly follows
  * lr_common, which is the layout lr_ooo_t describes for the out-of-order
  * record types.  lr_mask selects which of the remaining fields apply; an
  * optional lr_attr_t may follow the fixed-size part.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* file object to change attributes */
 	uint64_t	lr_mask;	/* mask of attributes to set */
 	uint64_t	lr_mode;	/* mode to set */
 	uint64_t	lr_uid;		/* uid to set */
 	uint64_t	lr_gid;		/* gid to set */
 	uint64_t	lr_size;	/* size to set */
 	uint64_t	lr_atime[2];	/* access time */
 	uint64_t	lr_mtime[2];	/* modification time */
 	/* optional attribute lr_attr_t may be here */
 } lr_setattr_t;
 
 /*
  * Log record for setting an ACL as an array of ace_t entries.  lr_foid
  * directly follows lr_common, which is the layout lr_ooo_t describes for
  * the out-of-order record types.
  * NOTE(review): presumably the old-format record used for TX_ACL_V0 -
  * confirm.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* obj id of file */
 	uint64_t	lr_aclcnt;	/* number of acl entries */
 	/* lr_aclcnt number of ace_t entries follow this */
 } lr_acl_v0_t;
 
 /*
  * Log record for setting an ACL made of variable sized ACEs.  lr_foid
  * directly follows lr_common, which is the layout lr_ooo_t describes for
  * the out-of-order record types.  The ACE data (lr_acl_bytes bytes) is
  * stored after the fixed-size part.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* obj id of file */
 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
 	uint64_t	lr_domcnt;	/* number of unique domains */
 	uint64_t	lr_fuidcnt;	/* number of real fuids */
 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
 	uint64_t	lr_acl_flags;	/* ACL flags */
 	/* lr_acl_bytes number of variable sized ace's follows */
 } lr_acl_t;
 
 /*
  * ZIL structure definitions, interface function prototype and globals.
  */
 
 /*
  * Writes are handled in three different ways:
  *
  * WR_INDIRECT:
  *    In this mode, if we need to commit the write later, then the block
  *    is immediately written into the file system (using dmu_sync),
  *    and a pointer to the block is put into the log record.
  *    When the txg commits the block is linked in.
  *    This saves additionally writing the data into the log record.
  *    There are a few requirements for this to occur:
  *	- write is greater than zfs/zvol_immediate_write_sz
  *	- not using slogs (as slogs are assumed to always be faster
  *	  than writing into the main pool)
  *	- the write occupies only one block
  * WR_COPIED:
  *    If we know we'll immediately be committing the
  *    transaction (FSYNC or FDSYNC), then we allocate a larger
  *    log record here for the data and copy the data in.
  * WR_NEED_COPY:
  *    Otherwise we don't allocate a buffer, and *if* we need to
  *    flush the write later then a buffer is allocated and
  *    we retrieve the data using the dmu.
  */
 typedef enum {
 	WR_INDIRECT,	/* indirect - a large write (dmu_sync() data */
 			/* and put blkptr in log, rather than actual data) */
 	WR_COPIED,	/* immediate - data is copied into lr_write_t */
 	WR_NEED_COPY,	/* immediate - data needs to be copied if pushed */
 	WR_NUM_STATES	/* number of states */
 } itx_wr_state_t;
 
 /*
  * Intent log transaction (itx): list linkage and bookkeeping fields,
  * followed by the log record itself.  itx_lr is the last member because
  * the type-specific part of the record and any immediate data follow it.
  */
 typedef struct itx {
 	list_node_t	itx_node;	/* linkage on zl_itx_list */
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
 	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
 
 typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
     uint64_t txg);
 typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
 
 extern void	zil_init(void);
 extern void	zil_fini(void);
 
 extern zilog_t	*zil_alloc(objset_t *os, zil_header_t *zh_phys);
 extern void	zil_free(zilog_t *zilog);
 
 extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
 extern void	zil_close(zilog_t *zilog);
 
 extern void	zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t	*zil_itx_create(uint64_t txtype, size_t lrsize);
 extern void	zil_itx_destroy(itx_t *itx);
 extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
 extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
 extern int	zil_claim(struct dsl_pool *dp,
     struct dsl_dataset *ds, void *txarg);
 extern int 	zil_check_log_chain(struct dsl_pool *dp,
     struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
 extern int	zil_suspend(const char *osname, void **cookiep);
 extern void	zil_resume(void *cookie);
 
 extern void	zil_add_block(zilog_t *zilog, const blkptr_t *bp);
 extern int	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
 
 extern void	zil_set_sync(zilog_t *zilog, uint64_t syncval);
 
 extern void	zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
 extern int zil_replay_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZIL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	(revision 296519)
@@ -1,149 +1,150 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_ZIL_IMPL_H
 #define	_SYS_ZIL_IMPL_H
 
 #include <sys/zil.h>
 #include <sys/dmu_objset.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Log write buffer.
  */
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
 	zio_t		*lwb_zio;	/* zio for this buffer */
 	dmu_tx_t	*lwb_tx;	/* tx for log block allocation */
 	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
 	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
 } lwb_t;
 
 /*
  * Intent log transaction lists
  */
 typedef struct itxs {
 	list_t		i_sync_list;	/* list of synchronous itxs */
 	avl_tree_t	i_async_tree;	/* tree of foids for async itxs */
 } itxs_t;
 
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
 	uint64_t	itxg_sod;	/* total size on disk for this txg */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
 /* for async nodes we build up an AVL tree of lists of async itxs per file */
 typedef struct itx_async_node {
 	uint64_t	ia_foid;	/* file object id */
 	list_t		ia_list;	/* list of async itxs for this foid */
 	avl_node_t	ia_node;	/* AVL tree linkage */
 } itx_async_node_t;
 
 /*
  * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
  * we've touched so we know which ones need a write cache flush at the end.
  */
 typedef struct zil_vdev_node {
 	uint64_t	zv_vdev;	/* vdev to be flushed */
 	avl_node_t	zv_node;	/* AVL tree linkage */
 } zil_vdev_node_t;
 
 #define	ZIL_PREV_BLKS 16
 
 /*
  * Stable storage intent log management structure.  One per dataset.
  */
 struct zilog {
 	kmutex_t	zl_lock;	/* protects most zilog_t fields */
 	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
 	spa_t		*zl_spa;	/* handle for read/write log */
 	const zil_header_t *zl_header;	/* log header buffer */
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	zio_t		*zl_root_zio;	/* log writer root zio */
 	uint64_t	zl_lr_seq;	/* on-disk log record sequence number */
 	uint64_t	zl_commit_lr_seq; /* last committed on-disk lr seq */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
 	uint64_t	zl_replaying_seq; /* current replay seq number */
 	uint32_t	zl_suspend;	/* log suspend count */
 	kcondvar_t	zl_cv_writer;	/* log writer thread completion */
 	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
 	uint8_t		zl_suspending;	/* log is currently suspending */
 	uint8_t		zl_keep_first;	/* keep first log block in destroy */
 	uint8_t		zl_replay;	/* replaying records while set */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
 	uint8_t		zl_logbias;	/* latency or throughput */
 	uint8_t		zl_sync;	/* synchronous or asynchronous */
 	int		zl_parse_error;	/* last zil_parse() error */
 	uint64_t	zl_parse_blk_seq; /* highest blk seq on last parse */
 	uint64_t	zl_parse_lr_seq; /* highest lr seq on last parse */
 	uint64_t	zl_parse_blk_count; /* number of blocks parsed */
 	uint64_t	zl_parse_lr_count; /* number of log records parsed */
 	uint64_t	zl_next_batch;	/* next batch number */
 	uint64_t	zl_com_batch;	/* committed batch number */
 	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
 	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
 	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
 	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
 	avl_tree_t	zl_bp_tree;	/* track bps during log parse */
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
 	zil_header_t	zl_old_header;	/* debugging aid */
 	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
 	uint_t		zl_prev_rotor;	/* rotor for zl_prev_blks[] */
 	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
 };
 
 typedef struct zil_bp_node {
 	dva_t		zn_dva;
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
 /*
  * SPA_OLD_MAXBLOCKSIZE less the size of a zil_chain_t and of an
  * lr_write_t, i.e. what is left of such a block once the chain header
  * and one write record header have been accounted for.
  */
 #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZIL_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(revision 296519)
@@ -1,3546 +1,3547 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/vdev_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/trim_map.h>
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
 
 /*
  * Virtual device management.
  */
 
 /*
  * The limit for ZFS to automatically increase a top-level vdev's ashift
  * from logical ashift to physical ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 12 (4096 bytes)
  *          zfs_max_auto_ashift = 11 (2048 bytes)
  *          zfs_min_auto_ashift = 9 (512 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 2048 as limited by
  * zfs_max_auto_ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 12 (4096 bytes)
  *          zfs_max_auto_ashift = 13 (8192 bytes)
  *          zfs_min_auto_ashift = 9 (512 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 4096 to match the
  * max vdev_physical_ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 9 (512 bytes)
  *          zfs_max_auto_ashift = 13 (8192 bytes)
  *          zfs_min_auto_ashift = 12 (4096 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 4096 to match the
  * zfs_min_auto_ashift.
  */
 static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
 static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
 
 static int
 sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	int err;
 
 	val = zfs_max_auto_ashift;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
 		return (EINVAL);
 
 	zfs_max_auto_ashift = val;
 
 	return (0);
 }
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_vfs_zfs_max_auto_ashift, "QU",
     "Max ashift used when optimising for logical -> physical sectors size on "
     "new top-level vdevs.");
 
 static int
 sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	int err;
 
 	val = zfs_min_auto_ashift;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
 		return (EINVAL);
 
 	zfs_min_auto_ashift = val;
 
 	return (0);
 }
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_vfs_zfs_min_auto_ashift, "QU",
     "Min ashift used when creating new top-level vdevs.");
 
 /*
  * NULL-terminated table of the vdev ops vectors that vdev_getops()
  * searches by type name.  Kernel builds list vdev_geom_ops; all other
  * builds list vdev_disk_ops in its place.
  */
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 #ifdef _KERNEL
 	&vdev_geom_ops,
 #else
 	&vdev_disk_ops,
 #endif
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	NULL
 };
 
 
 /*
  * When a vdev is added, it will be divided into approximately (but no
  * more than) this number of metaslabs.
  */
 int metaslabs_per_vdev = 200;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
     &metaslabs_per_vdev, 0,
     "When a vdev is added, how many metaslabs the vdev should be divided into");
 
 /*
  * Look up the ops vector whose vdev_op_type string equals 'type'.
  * Returns NULL when no entry of vdev_ops_table matches.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	vdev_ops_t **slot = vdev_ops_table;
 
 	/* The table ends with a NULL entry. */
 	while (*slot != NULL) {
 		if (strcmp((*slot)->vdev_op_type, type) == 0)
 			return (*slot);
 		slot++;
 	}
 
 	return (NULL);
 }
 
 /*
  * Default asize function, used by anything other than RAID-Z: take psize
  * rounded up to the top-level vdev's ashift, then return the largest of
  * that and the asize each child reports for the same psize.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		uint64_t child_asize =
 		    vdev_psize_to_asize(vd->vdev_child[i], psize);
 
 		if (child_asize > largest)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Get the minimum allocatable size of a vdev.  For a top-level vdev this
  * is its asize aligned to the metaslab size, which allows us to replace
  * or attach devices which don't have the same physical size but can still
  * satisfy the same number of allocations.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	/* No parent (inactive spare or cache): report our own asize as is. */
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	/* A top-level vdev reports its asize aligned to the metaslab size. */
 	if (vd->vdev_top == vd)
 		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 
 	/* Below anything but raidz, the parent's minimum applies unchanged. */
 	if (parent->vdev_ops != &vdev_raidz_ops)
 		return (parent->vdev_min_asize);
 
 	/*
 	 * The allocatable space for a raidz vdev is N * sizeof(smallest
 	 * child), so each child must provide at least 1/Nth of its asize.
 	 */
 	return (parent->vdev_min_asize / parent->vdev_children);
 }
 
 /*
  * Recompute vdev_min_asize for vd and then for every vdev beneath it.
  * The parent is updated before its children because vdev_get_min_asize()
  * on a child reads the parent's vdev_min_asize.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int c = 0;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	while (c < vd->vdev_children) {
 		vdev_set_min_asize(vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Return the child of the pool's root vdev at index 'vdev', or NULL when
  * the index is beyond the root's child count.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= root->vdev_children)
 		return (NULL);
 
 	ASSERT(root->vdev_child[vdev] != NULL);
 	return (root->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the tree rooted at vd for a vdev whose vdev_guid
  * equals 'guid'.  Returns the first match, or NULL if there is none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[i], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the tree rooted at vd.  A vdev whose ops are
  * marked vdev_op_leaf counts as one; any other vdev contributes the sum
  * over its children.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	int leaves;
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		return (1);
 
 	leaves = 0;
 	for (int i = 0; i < vd->vdev_children; i++)
 		leaves += vdev_count_leaves_impl(vd->vdev_child[i]);
 
 	return (leaves);
 }
 
 /*
  * Count the leaf vdevs beneath the pool's root vdev.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	return (vdev_count_leaves_impl(root));
 }
 
 /*
  * Make cvd a child of pvd at index cvd->vdev_id.  The child array is
  * replaced by a zeroed copy large enough for that index, cvd's vdev_top
  * is set, and cvd's guid sum is added to every ancestor.  With a NULL
  * pvd only cvd's parent pointer is (re)set.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	spa_t *spa = cvd->vdev_spa;
 	uint64_t slot = cvd->vdev_id;
 	size_t prev_bytes, grown_bytes;
 	vdev_t **children;
 	vdev_t *anc;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(slot >= pvd->vdev_children || pvd->vdev_child[slot] == NULL);
 
 	/* Grow the child count, if needed, so that 'slot' is a valid index. */
 	prev_bytes = pvd->vdev_children * sizeof (vdev_t *);
 	if (pvd->vdev_children < slot + 1)
 		pvd->vdev_children = slot + 1;
 	grown_bytes = pvd->vdev_children * sizeof (vdev_t *);
 
 	/* Carry the existing entries over into the new, zeroed array. */
 	children = kmem_zalloc(grown_bytes, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		bcopy(pvd->vdev_child, children, prev_bytes);
 		kmem_free(pvd->vdev_child, prev_bytes);
 	}
 	children[slot] = cvd;
 	pvd->vdev_child = children;
 
 	/* A child inherits its parent's top; otherwise it is its own top. */
 	if (pvd->vdev_top != NULL)
 		cvd->vdev_top = pvd->vdev_top;
 	else
 		cvd->vdev_top = cvd;
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (anc = pvd; anc != NULL; anc = anc->vdev_parent)
 		anc->vdev_guid_sum += cvd->vdev_guid_sum;
 }
 
 /*
  * Detach cvd from its parent pvd.  The child's slot is cleared, the child
  * array is freed once no children remain, and cvd's guid sum is
  * subtracted from every ancestor.  Nothing is done for a NULL pvd.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	uint_t id = cvd->vdev_id;
 	vdev_t *anc;
 	int c;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[id] == cvd);
 
 	cvd->vdev_parent = NULL;
 	pvd->vdev_child[id] = NULL;
 
 	/* Find the first slot still in use, if there is one. */
 	c = 0;
 	while (c < pvd->vdev_children && pvd->vdev_child[c] == NULL)
 		c++;
 
 	/* Every slot is empty: release the array itself. */
 	if (c == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (anc = pvd; anc != NULL; anc = anc->vdev_parent)
 		anc->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes in the child array.
  *
  * The surviving children are packed, in their original order, into a
  * freshly allocated array and each one's vdev_id is rewritten to its new
  * index.  If no child survives, the old array is released and vdev_child
  * is left NULL, which is the same state vdev_remove_child() leaves behind
  * when the last child goes away (and what vdev_free() asserts).  The
  * caller must hold SCL_ALL as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	vdev_t **newchild, *cvd;
 	int oldc = pvd->vdev_children;
 	int newc;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* No array at all: nothing to compact and nothing to free. */
 	if (oldc == 0)
 		return;
 
 	/* First pass: count the children that are still present. */
 	for (int c = newc = 0; c < oldc; c++)
 		if (pvd->vdev_child[c])
 			newc++;
 
 	if (newc > 0) {
 		newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
 
 		/* Second pass: pack the survivors and renumber them. */
 		for (int c = newc = 0; c < oldc; c++) {
 			if ((cvd = pvd->vdev_child[c]) != NULL) {
 				newchild[newc] = cvd;
 				cvd->vdev_id = newc++;
 			}
 		}
 	} else {
 		/*
 		 * Avoid a zero-byte allocation; a vdev with no children
 		 * must have a NULL child array.
 		 */
 		newchild = NULL;
 	}
 
 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 	pvd->vdev_child = newchild;
 	pvd->vdev_children = newc;
 }
 
 /*
  * Allocate a vdev_t and give it the minimal state every vdev needs.
  *
  * The first vdev allocated for a pool becomes its root vdev.  When 'guid'
  * is zero and 'ops' is not the hole ops vector, a fresh guid is generated.
  * The returned vdev is closed, has no parent, and owns its locks, DTL range
  * trees, txg lists, I/O queue and cache.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	int t;
 
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_guid(NULL);
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		/*
 		 * The root vdev's guid will also be the pool guid, which
 		 * must be unique among all pools (hence NULL).  Any other
 		 * vdev's guid only has to be unique within this pool.
 		 */
 		guid = spa_generate_guid(spa->spa_root_vdev == vd ? NULL : spa);
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	/* Identity. */
 	vd->vdev_spa = spa;
 	vd->vdev_ops = ops;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	vd->vdev_state = VDEV_STATE_CLOSED;
 
 	/* Locks, and the DTL range trees protected by vdev_dtl_lock. */
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	t = 0;
 	while (t < DTL_TYPES) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 		    &vd->vdev_dtl_lock);
 		t++;
 	}
 
 	/* Per-txg dirty lists. */
 	txg_list_create(&vd->vdev_ms_list,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list,
 	    offsetof(struct vdev, vdev_dtl_node));
 
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 	vdev_cache_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev from the nvlist description 'nv' and attach it to
  * 'parent' as child 'id'.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * On success, returns 0 and stores the new vdev in *vdp.  Returns EINVAL if
  * the nvlist lacks a required entry or describes an invalid configuration,
  * and ENOTSUP if the pool version is too old for the requested feature
  * (log device, hole vdev, or RAID-Z parity level).  All validation is done
  * before the vdev is allocated, so the error paths have nothing to free.
  *
  * The caller must hold SCL_ALL as writer.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	char *type;
 	uint64_t guid = 0, islog, nparity;
 	vdev_t *vd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 * A load additionally requires the id stored in the nvlist to match
 	 * the child slot we were asked to fill.  Spares, l2cache devices and
 	 * root-pool vdevs also take their guid from the nvlist.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the nparity property for RAID-Z vdevs.
 	 * The -1ULL sentinel lets the assertion below confirm that every
 	 * path through this block assigned a real value.
 	 */
 	nparity = -1ULL;
 	if (ops == &vdev_raidz_ops) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
 			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Older pool versions support fewer parity devices:
 			 * only one before SPA_VERSION_RAIDZ2, and at most
 			 * two before SPA_VERSION_RAIDZ3.
 			 */
 			if (nparity > 1 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(ENOTSUP));
 			if (nparity > 2 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ3)
 				return (SET_ERROR(ENOTSUP));
 		} else {
 			/*
 			 * We require the parity to be specified for SPAs that
 			 * support multiple parity levels.
 			 */
 			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Otherwise, we default to 1 parity device for RAID-Z.
 			 */
 			nparity = 1;
 		}
 	} else {
 		nparity = 0;
 	}
 	ASSERT(nparity != -1ULL);
 
 	vd = vdev_alloc_common(spa, id, guid, ops);
 
 	vd->vdev_islog = islog;
 	vd->vdev_nparity = nparity;
 
 	/*
 	 * Each lookup below stores a pointer to the nvlist's string directly
 	 * into the vdev field; on success that pointer is immediately
 	 * replaced with a private copy made by spa_strdup().
 	 */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 		vd->vdev_path = spa_strdup(vd->vdev_path);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 	    &vd->vdev_physpath) == 0)
 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 		vd->vdev_fru = spa_strdup(vd->vdev_fru);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 * A top-level vdev is one whose parent is the root, i.e. the parent
 	 * itself has no parent.
 	 */
 	if (parent && !parent->vdev_parent &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 	}
 
 	/*
 	 * Every top-level vdev except one being attached gets a metaslab
 	 * group, placed in the log class for log devices and in the normal
 	 * class otherwise.
 	 */
 	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		vd->vdev_mg = metaslab_group_create(islog ?
 		    spa_log_class(spa) : spa_normal_class(spa), vd);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
 		 * valid in the current context.  Local vdevs will
 		 * remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			/*
 			 * Default the reason to "error threshold exceeded"
 			 * unless the config says the state was set
 			 * externally.
 			 */
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				char *aux;
 
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Close and destroy a vdev together with its entire subtree.
  *
  * Children are freed first, then this vdev's allocation state, then it is
  * unlinked from its parent, and finally its own resources (queue, cache,
  * strings, spare/l2cache registration, txg lists, DTL state, locks) are
  * released.  If the vdev is the pool's root, spa_root_vdev is cleared.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 *
 	 * Each child's vdev_free() unlinks it through vdev_remove_child(),
 	 * which frees our child array and zeroes vdev_children once the
 	 * last child is gone.  That is what ends this loop and what makes
 	 * the vdev_child == NULL assertion below hold.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 	vdev_cache_fini(vd);
 
 	/* These strings, when set, are private copies owned by the vdev. */
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/*
 	 * The DTL range trees were created with vdev_dtl_lock as their
 	 * lock, so empty and destroy them while holding it.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Move everything that belongs to a top-level vdev from svd to tvd.
  *
  * That covers the metaslab array/shift/count, the metaslab group and
  * metaslab pointers, the space statistics, all per-txg dirty list
  * membership, config/state dirty list membership, the deflate ratio and
  * the log flag.  Each field is cleared in svd once it has been handed over.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	/* Metaslab geometry. */
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	svd->vdev_ms_array = 0;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	svd->vdev_ms_shift = 0;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	svd->vdev_ms_count = 0;
 
 	/* Metaslab group and metaslabs; the group must follow its vdev. */
 	if (tvd->vdev_mg) {
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	}
 	tvd->vdev_mg = svd->vdev_mg;
 	svd->vdev_mg = NULL;
 	tvd->vdev_ms = svd->vdev_ms;
 	svd->vdev_ms = NULL;
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 
 	/* Space accounting. */
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	svd->vdev_stat.vs_alloc = 0;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	svd->vdev_stat.vs_space = 0;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/* Re-home every per-txg dirty list entry, one txg slot at a time. */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		metaslab_t *msp;
 		vdev_t *vd;
 
 		msp = txg_list_remove(&svd->vdev_ms_list, t);
 		while (msp != NULL) {
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 			msp = txg_list_remove(&svd->vdev_ms_list, t);
 		}
 
 		vd = txg_list_remove(&svd->vdev_dtl_list, t);
 		while (vd != NULL) {
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 			vd = txg_list_remove(&svd->vdev_dtl_list, t);
 		}
 
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Dirty-list membership moves from the old top-level to the new. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 }
 
 /*
  * Point vd and every vdev beneath it at tvd as their top-level vdev.
  * A NULL vd is ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		int c = 0;
 
 		vd->vdev_top = tvd;
 
 		while (c < vd->vdev_children) {
 			vdev_top_update(tvd, vd->vdev_child[c]);
 			c++;
 		}
 	}
 }
 
 /*
  * Add a mirror/replacing vdev above an existing vdev.
  *
  * A new interior vdev using 'ops' takes over cvd's slot in cvd's current
  * parent, and cvd becomes the new vdev's child.  If the new vdev ends up
  * being a top-level vdev, cvd's top-level state is transferred to it.
  * Returns the new interior vdev.  The caller must hold SCL_ALL as writer.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *oldparent = cvd->vdev_parent;
 	vdev_t *interior;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	interior = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	/* The new vdev starts out with the child's sizes and alignment. */
 	interior->vdev_asize = cvd->vdev_asize;
 	interior->vdev_min_asize = cvd->vdev_min_asize;
 	interior->vdev_max_asize = cvd->vdev_max_asize;
 
 	interior->vdev_ashift = cvd->vdev_ashift;
 	interior->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	interior->vdev_physical_ashift = cvd->vdev_physical_ashift;
 
 	interior->vdev_state = cvd->vdev_state;
 	interior->vdev_crtxg = cvd->vdev_crtxg;
 
 	/* Swap the new vdev into cvd's old slot ... */
 	vdev_remove_child(oldparent, cvd);
 	vdev_add_child(oldparent, interior);
 
 	/* ... and hang cvd below it, in the next free child slot. */
 	cvd->vdev_id = interior->vdev_children;
 	vdev_add_child(interior, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (interior->vdev_top == interior)
 		vdev_top_transfer(cvd, interior);
 
 	return (interior);
 }
 
 /*
  * Remove a 1-way mirror/replacing vdev from the tree.
  *
  * cvd's parent (which must be a mirror, replacing or spare vdev with cvd
  * as its only child) is unlinked and freed, and cvd takes over its slot
  * in the grandparent.  The caller must hold SCL_ALL as writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *mvd = cvd->vdev_parent;
 	vdev_t *gpvd = mvd->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(mvd->vdev_children == 1);
 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 
 	/* The child keeps the alignment of the vdev it is replacing. */
 	cvd->vdev_ashift = mvd->vdev_ashift;
 	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(gpvd, mvd);
 
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 */
 	if (mvd->vdev_top == mvd) {
 		uint64_t old_guid = cvd->vdev_guid;
 		uint64_t guid_delta = mvd->vdev_guid - old_guid;
 
 		cvd->vdev_orig_guid = old_guid;
 		cvd->vdev_guid = old_guid + guid_delta;
 		cvd->vdev_guid_sum += guid_delta;
 	}
 
 	/* Take over mvd's slot in the grandparent. */
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(gpvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (cvd->vdev_top == cvd)
 		vdev_top_transfer(mvd, cvd);
 
 	ASSERT(mvd->vdev_children == 0);
 	vdev_free(mvd);
 }
 
 /*
  * Create (or extend) the metaslab array of a top-level vdev.
  *
  * The array is sized to vdev_asize >> vdev_ms_shift entries; existing
  * entries are carried over and only the new tail is initialized.  With
  * txg == 0 the space map object of each new metaslab is read from the
  * vdev's metaslab array object in the MOS; otherwise it starts out as 0.
  * The metaslab group is activated on first initialization unless the vdev
  * is being removed.
  *
  * Returns 0 on success (including when vdev_ms_shift is 0 and there is
  * nothing to do), or the error from dmu_read() / metaslab_init().
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	uint64_t m;
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * Compute the raidz-deflation ratio.  Note, we hard-code
 	 * in 128k (1 << 17) because it is the "typical" blocksize.
 	 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
 	 * otherwise it would inconsistently account for existing bp's.
 	 */
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 
 	ASSERT(oldc <= newc);
 
 	/* Build the larger array, carrying over any existing metaslabs. */
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 	if (oldc != 0) {
 		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 	vd->vdev_ms_count = newc;
 	vd->vdev_ms = mspp;
 
 	m = oldc;
 	while (m < newc) {
 		uint64_t object = 0;
 		int error;
 
 		if (txg == 0) {
 			error = dmu_read(mos, vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error)
 				return (error);
 		}
 
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &vd->vdev_ms[m]);
 		if (error)
 			return (error);
 
 		m++;
 	}
 
 	/* With txg == 0, SCL_ALLOC is taken here around the activation. */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is being removed we don't activate
 	 * the metaslabs since we want to ensure that no new
 	 * allocations are performed on this device.
 	 */
 	if (oldc == 0 && !vd->vdev_removing)
 		metaslab_group_activate(vd->vdev_mg);
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Tear down a vdev's metaslabs: passivate its metaslab group, destroy every
  * metaslab that was created, and free the metaslab array.  Does nothing if
  * the vdev has no metaslab array.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	uint64_t count = vd->vdev_ms_count;
 	uint64_t m = 0;
 
 	if (vd->vdev_ms == NULL)
 		return;
 
 	metaslab_group_passivate(vd->vdev_mg);
 
 	/* Slots may be NULL, so skip any that hold no metaslab. */
 	while (m < count) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		if (msp != NULL)
 			metaslab_fini(msp);
 		m++;
 	}
 
 	kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 	vd->vdev_ms = NULL;
 }
 
 /*
  * State shared by all the i/os that make up one probe of a vdev.  It is
  * allocated by vdev_probe() and freed by vdev_probe_done() when the probe's
  * null zio completes.
  */
 typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;	/* set when a probe read succeeds */
 	boolean_t	vps_writeable;	/* set when a probe write succeeds */
 	int		vps_flags;	/* zio flags used for the probe i/os */
 } vdev_probe_stats_t;
 
 /*
  * Completion callback shared by every zio belonging to a probe.
  *
  *   read:  note readability; on success in a writeable pool, write the
  *          same buffer back to the same location, else free the buffer.
  *   write: note writeability and free the buffer.
  *   null:  the probe as a whole is done; fold the results into the vdev,
  *          set the probe's error, clear vdev_probe_zio, fail any parent
  *          zio that can no longer access the vdev, and free the stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		if (zio->io_error == 0) {
 			vps->vps_readable = 1;
 			if (spa_writeable(spa)) {
 				/* The buffer is handed on to the write. */
 				zio_nowait(zio_write_phys(vd->vdev_probe_zio,
 				    vd, zio->io_offset, zio->io_size,
 				    zio->io_data, ZIO_CHECKSUM_OFF,
 				    vdev_probe_done, vps,
 				    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags,
 				    B_TRUE));
 				break;
 			}
 		}
 		zio_buf_free(zio->io_data, zio->io_size);
 		break;
 
 	case ZIO_TYPE_WRITE:
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		zio_buf_free(zio->io_data, zio->io_size);
 		break;
 
 	case ZIO_TYPE_NULL: {
 		zio_t *pio;
 
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		if (!vdev_readable(vd) ||
 		    (!vdev_writeable(vd) && spa_writeable(spa))) {
 			ASSERT(zio->io_error != 0);
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, 0, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 		} else {
 			zio->io_error = 0;
 		}
 
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		pio = zio_walk_parents(zio);
 		while (pio != NULL) {
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 			pio = zio_walk_parents(zio);
 		}
 
 		kmem_free(vps, sizeof (*vps));
 		break;
 	}
 
 	default:
 		break;
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * 'zio', if non-NULL, is the i/o on whose behalf the probe is requested; it
  * becomes a parent of the probe zio, and an async SPA_ASYNC_PROBE request
  * is posted when this call creates the probe.
  *
  * Returns the probe zio, not yet issued, when 'zio' is NULL; the caller is
  * then expected to issue/wait on it (vdev_open() uses zio_wait()).  Returns
  * NULL when 'zio' is non-NULL: either 'zio' is itself a probe i/o, or the
  * probe has been (or already was) started and 'zio' has been attached to
  * it.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		/* vps stays NULL unless this call is the one creating the probe. */
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
 		    ZIO_FLAG_TRYHARD;
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
 		/*
 		 * We can't change the vdev state in this context, so we
 		 * kick off an async task to do it on our behalf.
 		 */
 		if (zio != NULL) {
 			vd->vdev_probe_wanted = B_TRUE;
 			spa_async_request(spa, SPA_ASYNC_PROBE);
 		}
 	}
 
 	/* Attach the requester while still holding the probe lock. */
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/*
 	 * A probe was already in flight; the requester has been attached to
 	 * it above and there is nothing more to issue.
 	 */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/*
 	 * Issue one read per label, skipping label 0.  Each buffer allocated
 	 * here is freed in vdev_probe_done(), either by the read completion
 	 * or by the completion of the follow-up write.
 	 */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_pad2)),
 		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	/* No requester: hand the unissued probe zio back to the caller. */
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * Taskq callback that opens one child vdev (passed as 'arg') and records
  * the result in its vdev_open_error.  vdev_open_thread is set to the
  * current thread for the duration of the open, which is what vdev_open()'s
  * entry assertion accepts in place of holding the config lock.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_open_thread = curthread;
 	child->vdev_open_error = vdev_open(child);
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Return B_TRUE if vd, or any vdev beneath it, has a path that begins
  * with ZVOL_DIR; B_FALSE otherwise.
  */
 boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	int c;
 
 	if (vd->vdev_path != NULL &&
 	    strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0)
 		return (B_TRUE);
 
 	c = 0;
 	while (c < vd->vdev_children) {
 		if (vdev_uses_zvols(vd->vdev_child[c]))
 			return (B_TRUE);
 		c++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Open every child of vd, storing each result in the child's
  * vdev_open_error.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	int children = vd->vdev_children;
 	taskq_t *tq;
 
 	/*
 	 * in order to handle pools on top of zvols, do the opens
 	 * in a single thread so that the same thread holds the
 	 * spa_namespace_lock
 	 *
 	 * The B_TRUE below forces this serial path unconditionally, so
 	 * the taskq-based parallel open that follows is never reached.
 	 */
 	if (B_TRUE || vdev_uses_zvols(vd)) {
 		int c = 0;
 
 		while (c < children) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			cvd->vdev_open_error = vdev_open(cvd);
 			c++;
 		}
 		return;
 	}
 
 	/* One taskq thread per child; destroying the taskq waits for them. */
 	tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 
 	for (int c = 0; c < children; c++) {
 		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
 		    TQ_SLEEP) != 0);
 	}
 
 	taskq_destroy(tq);
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * Opens the vdev through its ops vector, derives its physical and
  * allocatable sizes and its ashift values, sets its state, and (for leaf
  * vdevs) probes it before declaring it usable.
  *
  * Returns 0 on success, or:
  *   ENXIO      the vdev is faulted or offline
  *   EOVERFLOW  the device is smaller than the minimum supported size
  *   EINVAL     the allocatable size shrank, the logical ashift exceeds
  *              SPA_MAXASHIFT, or a leaf's ashift exceeds its top-level's
  *   other      whatever the open op, device-fault injection, or the probe
  *              returned
  * On every failure path the vdev state is updated before returning.
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_notrim = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, ENXIO);
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    vd->vdev_stat.vs_aux);
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
 		trim_map_create(vd);
 
 	/* An interior vdev is degraded if any of its children is unhealthy. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round the reported sizes down to a multiple of the label size. */
 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
 	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
 
 	if (vd->vdev_children == 0) {
 		/*
 		 * No children: the reported size includes the label areas,
 		 * which are not allocatable.
 		 */
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		/*
 		 * Has children: the reported size is used as the
 		 * allocatable size as-is.  The minimum-size check is
 		 * skipped for the root (which has no parent).
 		 */
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* The ashift values only ever grow across opens. */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
 	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
 
 	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_ASHIFT_TOO_BIG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For testing purposes, a higher ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 	} else {
 		/*
 		 * Make sure the alignment requirement hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EINVAL));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy and the asize has increased,
 	 * then we've experienced dynamic LUN growth.  If automatic
 	 * expansion is enabled then use the additional space.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the min and max ashift values for normal data devices.
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    !vd->vdev_islog && vd->vdev_aux == NULL) {
 		if (vd->vdev_ashift > spa->spa_max_ashift)
 			spa->spa_max_ashift = vd->vdev_ashift;
 		if (vd->vdev_ashift < spa->spa_min_ashift)
 			spa->spa_min_ashift = vd->vdev_ashift;
 	}
 
 	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
 	 * since this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
 	    vdev_resilver_needed(vd, NULL, NULL))
 		spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 	return (0);
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents.  This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * If 'strict' is false ignore the spa guid check. This is necessary because
  * if the machine crashed during a re-guid the new guid might have been written
  * to all of the vdev labels, but not the cached config. The strict check
  * will be performed when the pool is opened again using the mos config.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  *
  * Children are validated first (recursively); only readable leaf vdevs have
  * their own label examined.  The failure return value is EBADF.
  */
 int
 vdev_validate(vdev_t *vd, boolean_t strict)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	uint64_t guid = 0, top_guid;
 	uint64_t state;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		if (vdev_validate(vd->vdev_child[c], strict) != 0)
 			return (SET_ERROR(EBADF));
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		uint64_t aux_guid = 0;
 		nvlist_t *nvl;
 		/*
 		 * Use the last synced txg when there is one, otherwise
 		 * -1ULL.  NOTE(review): presumably -1ULL means "accept a
 		 * label from any txg" -- confirm against
 		 * vdev_label_read_config().
 		 */
 		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
 		    spa_last_synced_txg(spa) : -1ULL;
 
 		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (0);
 		}
 
 		/*
 		 * Determine if this vdev has been split off into another
 		 * pool.  If so, then refuse to open it.
 		 */
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_SPLIT_POOL);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/* In strict mode the label's pool guid must match ours. */
 		if (strict && (nvlist_lookup_uint64(label,
 		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
 		    guid != spa_guid(spa))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/*
 		 * Reuse aux_guid for the vdev's original guid, if the
 		 * label's vdev tree records one; otherwise leave it as 0.
 		 */
 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 		    &aux_guid) != 0)
 			aux_guid = 0;
 
 		/*
 		 * If this vdev just became a top-level vdev because its
 		 * sibling was detached, it will have adopted the parent's
 		 * vdev guid -- but the label may or may not be on disk yet.
 		 * Fortunately, either version of the label will have the
 		 * same top guid, so if we're a top-level vdev, we can
 		 * safely compare to that instead.
 		 *
 		 * If we split this vdev off instead, then we also check the
 		 * original pool's guid.  We don't want to consider the vdev
 		 * corrupt if it is partway through a split operation.
 		 */
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
 		    &guid) != 0 ||
 		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
 		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/* Everything needed has been copied out of the label. */
 		nvlist_free(label);
 
 		/*
 		 * If this is a verbatim import, no need to check the
 		 * state of the pool.
 		 */
 		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 		    spa_load_state(spa) == SPA_LOAD_OPEN &&
 		    state != POOL_STATE_ACTIVE)
 			return (SET_ERROR(EBADF));
 
 		/*
 		 * If we were able to open and validate a vdev that was
 		 * previously marked permanently unavailable, clear that state
 		 * now.
 		 */
 		if (vd->vdev_not_present)
 			vd->vdev_not_present = 0;
 	}
 
 	return (0);
 }
 
 /*
  * Close a virtual device.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE_ALL, RW_WRITER) ==
 	    SCL_STATE_ALL);
 
 	/*
 	 * A reopening parent makes this vdev reopening too, except when
 	 * this vdev is on its way offline.
 	 */
 	if (parent != NULL && parent->vdev_reopening)
 		vd->vdev_reopening = !vd->vdev_offline;
 
 	/* Close the device, then drop its cache and (for leaves) trim map. */
 	vd->vdev_ops->vdev_op_close(vd);
 	vdev_cache_purge(vd);
 	if (vd->vdev_ops->vdev_op_leaf)
 		trim_map_destroy(vd);
 
 	/*
 	 * Remember the state we had before closing.  A later reopen() uses
 	 * it to avoid generating FMA ereports for a device that is simply
 	 * still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	vd->vdev_state = vd->vdev_offline ?
 	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively apply the vdev_op_hold operation to every leaf at or
  * below 'vd'.  Nothing is done while the pool state is still
  * POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	int i = 0;
 
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	if (vd->vdev_spa->spa_state == POOL_STATE_UNINITIALIZED)
 		return;
 
 	/* Children first, then this vdev itself if it is a leaf. */
 	while (i < vd->vdev_children) {
 		vdev_hold(vd->vdev_child[i]);
 		i++;
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_hold(vd);
 }
 
 /*
  * Recursively apply the vdev_op_rele operation to every leaf at or
  * below 'vd'.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	int i = 0;
 
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	/* Children first, then this vdev itself if it is a leaf. */
 	while (i < vd->vdev_children) {
 		vdev_rele(vd->vdev_child[i]);
 		i++;
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_rele(vd);
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* Flag the vdev as reopening, except when it is being offlined. */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Re-validate after the open so that we know we still have the
 	 * same device; without this, a device carrying an invalid label
 	 * could be opened successfully through vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd, B_TRUE);
 	} else {
 		(void) vdev_validate_aux(vd);
 
 		/*
 		 * A usable l2cache device that the L2ARC does not yet
 		 * know about gets added here.
 		 */
 		if (vdev_readable(vd) && vdev_writeable(vd)) {
 			if (vd->vdev_aux == &spa->spa_l2cache &&
 			    !l2arc_vdev_present(vd))
 				l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/* Let the parent's health reflect the result. */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open 'vd' for a create (or replace, when 'isreplacing' is set), load
  * its DTLs and initialize its labels.  Returns 0 on success; on any
  * failure the vdev is closed again and an errno value is returned.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	int err;
 
 	/*
 	 * Partial opens (e.g. of a mirror) are normally tolerated, but a
 	 * create must fail if any component cannot be opened, so anything
 	 * short of HEALTHY is turned into ENXIO.
 	 */
 	err = vdev_open(vd);
 	if (err == 0 && vd->vdev_state != VDEV_STATE_HEALTHY)
 		err = ENXIO;
 
 	/* Recursively load DTLs, then initialize all labels. */
 	if (err == 0)
 		err = vdev_dtl_load(vd);
 	if (err == 0) {
 		err = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 
 	if (err != 0)
 		vdev_close(vd);
 
 	return (err);
 }
 
 /*
  * Choose vdev_ms_shift so that the vdev ends up with roughly
  * metaslabs_per_vdev (default 200) metaslabs, never going below
  * SPA_MAXBLOCKSHIFT.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
 
 	if (shift < SPA_MAXBLOCKSHIFT)
 		shift = SPA_MAXBLOCKSHIFT;
 
 	vd->vdev_ms_shift = shift;
 }
 
 /*
  * Maximize performance by inflating the configured ashift for top level
  * vdevs to be as close to the physical ashift as possible while maintaining
  * administrator defined limits and ensuring it doesn't go below the
  * logical ashift.
  */
 void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	/* Only top-level vdevs are adjusted. */
 	if (vd != vd->vdev_top)
 		return;
 
 	if (vd->vdev_ashift >= vd->vdev_physical_ashift) {
 		/*
 		 * Unusual: the logical ashift is not below the physical
 		 * one.  Capping by the max ashift here would cause
 		 * failures, so only raise the value to the configured
 		 * minimum if needed.
 		 */
 		vd->vdev_ashift = MAX(zfs_min_auto_ashift, vd->vdev_ashift);
 		return;
 	}
 
 	/* Inflate towards the physical ashift, within the tunable limits. */
 	vd->vdev_ashift = MIN(
 	    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
 	    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
 }
 
 /*
  * Mark the top-level vdev 'vd' dirty in 'txg'.  Depending on 'flags',
  * 'arg' is queued on the vdev's metaslab list (VDD_METASLAB) and/or its
  * DTL list (VDD_DTL); the vdev itself is always queued on the pool's
  * per-txg vdev list.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
 	ASSERT(!vd->vdev_ishole);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if ((flags & VDD_METASLAB) != 0)
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	if ((flags & VDD_DTL) != 0)
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Walk the tree below 'vd' and, for every leaf found, call vdev_dirty()
  * on that leaf's top-level vdev with the leaf as the argument.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	int i = 0;
 
 	while (i < vd->vdev_children) {
 		vdev_dirty_leaves(vd->vdev_child[i], flags, txg);
 		i++;
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		vdev_dirty(vd->vdev_top, flags, vd, txg);
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 /*
  * Add the range [txg, txg + size) to the DTL of type 't' on 'vd',
  * unless the range tree already contains it.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *rt;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	/* Index the DTL array only after 't' has been range-checked. */
 	rt = vd->vdev_dtl[t];
 
 	mutex_enter(rt->rt_lock);
 	if (!range_tree_contains(rt, txg, size))
 		range_tree_add(rt, txg, size);
 	mutex_exit(rt->rt_lock);
 }
 
 /*
  * Report whether the DTL of type 't' on 'vd' contains the range
  * [txg, txg + size).  An empty DTL never contains anything.
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *rt;
 	boolean_t dirty = B_FALSE;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	/* Index the DTL array only after 't' has been range-checked. */
 	rt = vd->vdev_dtl[t];
 
 	mutex_enter(rt->rt_lock);
 	if (range_tree_space(rt) != 0)
 		dirty = range_tree_contains(rt, txg, size);
 	mutex_exit(rt->rt_lock);
 
 	return (dirty);
 }
 
 /*
  * Report whether the DTL of type 't' on 'vd' currently holds no space.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	range_tree_t *tree = vd->vdev_dtl[t];
 	uint64_t space;
 
 	/* Sample the tree's space under its own lock. */
 	mutex_enter(tree->rt_lock);
 	space = range_tree_space(tree);
 	mutex_exit(tree->rt_lock);
 
 	return (space == 0);
 }
 
 /*
  * Returns the lowest txg in the DTL range.
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	range_seg_t *first;
 
 	/* Caller holds the DTL lock; vd is a leaf with a non-empty DTL. */
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	/* The result is one less than the start of the first segment. */
 	first = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (first->rs_start - 1);
 }
 
 /*
  * Returns the highest txg in the DTL.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	range_seg_t *last;
 
 	/* Caller holds the DTL lock; vd is a leaf with a non-empty DTL. */
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	/* The result is the end of the last segment in DTL_MISSING. */
 	last = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (last->rs_end);
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd)
 {
 	dsl_scan_t *scan = vd->vdev_spa->spa_dsl_pool->dp_scan;
 
 	ASSERT0(scan->scn_phys.scn_errors);
 	ASSERT0(vd->vdev_children);
 
 	/* Not resilvering, or nothing missing: always eligible. */
 	if (vd->vdev_resilver_txg == 0)
 		return (B_TRUE);
 	if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
 		return (B_TRUE);
 
 	/*
 	 * When a resilver is initiated the scan assigns scn_max_txg the
 	 * highest txg value that exists in all DTLs.  If this device's
 	 * max DTL is not part of this scan (i.e. it is not in the range
 	 * (scn_min_txg, scn_max_txg]) then it is not eligible for
 	 * excision.
 	 */
 	if (vdev_dtl_max(vd) > scan->scn_phys.scn_max_txg)
 		return (B_FALSE);
 
 	ASSERT3U(scan->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
 	ASSERT3U(scan->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
 	ASSERT3U(vd->vdev_resilver_txg, <=, scan->scn_phys.scn_max_txg);
 
 	return (B_TRUE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion.
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	/* Reassess the whole subtree bottom-up: children before 'vd'. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done);
 
 	/* Nothing further is done for the root vdev, holes or aux vdevs. */
 	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If we've completed a scan cleanly then determine
 		 * if this vdev should remove any DTLs. We only want to
 		 * excise regions on vdevs that were available during
 		 * the entire duration of this scan.
 		 */
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started ||
 		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
 		    vdev_dtl_should_excise(vd)) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
 			 * will be valid, so excise the old region and
 			 * fold in the scrub dtl.  Otherwise, leave the
 			 * dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the
 			 * beginning of the DTL_MISSING map, we put it into
 			 * a reference tree and then add a segment with
 			 * refcnt -1 that covers the range [0, scrub_txg).
 			 * This means that each txg in that range has
 			 * refcnt -1 or 0.  We then add DTL_SCRUB with a
 			 * refcnt of 2, so that entries in the range
 			 * [0, scrub_txg) will have a positive refcnt --
 			 * either 1 or 2.  We then convert the reference
 			 * tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 		}
 		/* Rebuild DTL_PARTIAL as a copy of DTL_MISSING. */
 		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
 		/*
 		 * Rebuild DTL_OUTAGE: an unreadable leaf gets the whole
 		 * range [0, -1ULL); a readable one gets a copy of
 		 * DTL_MISSING.
 		 */
 		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
 			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
 			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 
 		/*
 		 * If the vdev was resilvering and no longer has any
 		 * DTLs then reset its resilvering flag and dirty
 		 * the top level so that we persist the change.
 		 */
 		if (vd->vdev_resilver_txg != 0 &&
 		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
 		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
 			vd->vdev_resilver_txg = 0;
 			vdev_config_dirty(vd->vdev_top);
 		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * With a non-zero txg, queue this leaf on its top-level
 		 * vdev's DTL list for that txg.
 		 */
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
 	}
 
 	/*
 	 * Interior vdev: regenerate each DTL (except DTL_SCRUB) from the
 	 * children's maps, keeping only ranges referenced by at least
 	 * 'minref' children.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		/* account for child's outage in parent's missing map */
 		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 		if (t == DTL_SCRUB)
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
 		else if (vd->vdev_nparity != 0)
 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			/* Each child's map is read under that child's lock. */
 			mutex_enter(&cvd->vdev_dtl_lock);
 			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
 		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
 		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Load the on-disk DTL for 'vd'.  A leaf with a DTL object opens its
  * space map and loads it into DTL_MISSING; any other vdev recurses into
  * its children and stops at the first failure.  Returns 0 or the first
  * error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	int err;
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int i = 0; i < vd->vdev_children; i++) {
 			err = vdev_dtl_load(vd->vdev_child[i]);
 			if (err != 0)
 				return (err);
 		}
 		return (0);
 	}
 
 	ASSERT(!vd->vdev_ishole);
 
 	err = space_map_open(&vd->vdev_dtl_sm,
 	    vd->vdev_spa->spa_meta_objset, vd->vdev_dtl_object,
 	    0, -1ULL, 0, &vd->vdev_dtl_lock);
 	if (err)
 		return (err);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 
 	/*
 	 * The space map is open now; bring the in-core DTL up to date
 	 * before loading it.
 	 */
 	space_map_update(vd->vdev_dtl_sm);
 	err = space_map_load(vd->vdev_dtl_sm,
 	    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
 
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (err);
 }
 
 /*
  * Write the DTL_MISSING map of leaf vdev 'vd' to its space map object
  * in 'txg'.  If the vdev has been detached, or its top-level vdev is
  * being removed, the space map is freed instead.
  */
 void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *rtsync;
 	kmutex_t rtlock;
 	dmu_tx_t *tx;
 	/* Remembered so that a change of object can be detected below. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(!vd->vdev_ishole);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	/* The DTL is no longer needed: free and close its space map. */
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* First sync for this vdev: allocate and open a new space map. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0, &vd->vdev_dtl_lock));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	bzero(&rtlock, sizeof(rtlock));
 	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * Copy DTL_MISSING into a private range tree (guarded by rtlock)
 	 * while holding the DTL lock, then write the copy out.
 	 */
 	rtsync = range_tree_create(NULL, NULL, &rtlock);
 
 	mutex_enter(&rtlock);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	space_map_truncate(vd->vdev_dtl_sm, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
 
 	mutex_exit(&rtlock);
 	mutex_destroy(&rtlock);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 *
 	 * The uint64_t values are cast explicitly because "%llu"
 	 * requires an unsigned long long argument.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (unsigned long long)txg,
 		    spa_name(spa), (unsigned long long)object,
 		    (unsigned long long)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_update(vd->vdev_dtl_sm);
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Determine whether the specified vdev can be offlined/detached/removed
  * without losing data.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t needed;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* The root vdev and top-level vdevs are always required. */
 	if (vd == spa->spa_root_vdev)
 		return (B_TRUE);
 	if (vd == top)
 		return (B_TRUE);
 
 	/*
 	 * Pretend the device cannot be read and see whether the top-level
 	 * vdev then reports any outage.  If it does not, the device can be
 	 * offlined/detached/removed safely.  Afterwards restore the real
 	 * flag and recompute the DTLs once more.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 	needed = !vdev_dtl_empty(top, DTL_OUTAGE);
 
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 
 	/* With injection enabled, an injected device fault may override. */
 	if (!needed && zio_injection_enabled)
 		needed = !!zio_handle_device_injection(vd, NULL, ECHILD);
 
 	return (needed);
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  */
 /*
  * Returns B_TRUE if any writeable leaf at or below 'vd' has a non-empty
  * DTL_MISSING.  In that case the lowest and highest txg bounds found
  * are stored through 'minp' and 'maxp'; either pointer may be NULL if
  * the caller does not want that value.  The outputs are left untouched
  * when B_FALSE is returned.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		/* Interior vdev: merge the ranges reported by the children. */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	/*
 	 * Check each output pointer on its own, so that a caller passing
 	 * only one of them never causes a NULL dereference.
 	 */
 	if (needed) {
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Recursively load 'vd' and its children: initialize the metaslabs of
  * top-level vdevs and load the DTL of leaf vdevs.  A failure in either
  * step marks the vdev CANT_OPEN / CORRUPT_DATA.
  */
 void
 vdev_load(vdev_t *vd)
 {
 	int i = 0;
 
 	/* Children are loaded before the vdev itself. */
 	while (i < vd->vdev_children) {
 		vdev_load(vd->vdev_child[i]);
 		i++;
 	}
 
 	/*
 	 * A top-level vdev (that is not a hole) needs a non-zero ashift
 	 * and asize, and its metaslabs must initialize successfully.
 	 */
 	if (vd == vd->vdev_top && !vd->vdev_ishole) {
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
 		    vdev_metaslab_init(vd, 0) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 
 	/* A leaf vdev needs its DTL loaded. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (vdev_dtl_load(vd) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 }
 
 /*
  * The special vdev case is used for hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  To do this,
  * we make sure that we can open the underlying device, then try to read the
  * label, and make sure that the label is sane and that it hasn't been
  * repurposed to another pool.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *label;
 	uint64_t guid, version;
 	uint64_t state;
 	int sane;
 
 	/* A device that cannot be read is left as it is. */
 	if (!vdev_readable(vd))
 		return (0);
 
 	label = vdev_label_read_config(vd, -1ULL);
 	if (label == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/*
 	 * The label is sane when it has a supported version, carries this
 	 * vdev's guid and records a pool state.  The pool state's value is
 	 * deliberately not examined: if the device is in fact in use by
 	 * another pool, that is discovered on the fly when requested.
 	 */
 	sane = nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
 	    &version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(version) &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0;
 
 	if (!sane) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 	}
 
 	nvlist_free(label);
 
 	return (sane ? 0 : -1);
 }
 
 /*
  * Free the on-disk metadata of 'vd' in 'txg': the space map of every
  * metaslab that still has one, and the metaslab array object.
  */
 void
 vdev_remove(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *group = vd->vdev_mg;
 
 		metaslab_group_histogram_verify(group);
 		metaslab_class_histogram_verify(group->mg_class);
 
 		for (int idx = 0; idx < vd->vdev_ms_count; idx++) {
 			metaslab_t *ms = vd->vdev_ms[idx];
 
 			if (ms != NULL && ms->ms_sm != NULL) {
 				mutex_enter(&ms->ms_lock);
 
 				/*
 				 * The histogram accounting may be stale
 				 * if the metaslab was not loaded when the
 				 * vdev was removed.  Remove it here so
 				 * the metaslab group and class stay
 				 * up-to-date.
 				 */
 				metaslab_group_histogram_remove(group, ms);
 
 				/* The space map must be empty by now. */
 				VERIFY0(space_map_allocated(ms->ms_sm));
 				space_map_free(ms->ms_sm, tx);
 				space_map_close(ms->ms_sm);
 				ms->ms_sm = NULL;
 
 				mutex_exit(&ms->ms_lock);
 			}
 		}
 
 		metaslab_group_histogram_verify(group);
 		metaslab_class_histogram_verify(group->mg_class);
 		for (int b = 0; b < RANGE_TREE_HISTOGRAM_SIZE; b++)
 			ASSERT0(group->mg_histogram[b]);
 	}
 
 	if (vd->vdev_ms_array != 0) {
 		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
 		vd->vdev_ms_array = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish syncing 'txg' for 'vd': run metaslab_sync_done() on every
  * metaslab queued on the TXG_CLEAN(txg) list, and reassess the metaslab
  * group if that list was not empty to begin with.
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	metaslab_t *msp;
 	/* Sampled before the list is drained below. */
 	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * The assignment is parenthesized and compared against NULL
 	 * explicitly, matching the equivalent loops in vdev_sync().
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list,
 	    TXG_CLEAN(txg))) != NULL)
 		metaslab_sync_done(msp, txg);
 
 	if (reassess)
 		metaslab_sync_reassess(vd->vdev_mg);
 }
 
 /*
  * Sync the dirty state of 'vd' for 'txg': allocate the metaslab array
  * object if it does not exist yet, remove the vdev's metadata when it
  * is empty and being removed, then sync all dirty metaslabs and DTLs.
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *leaf;
 	metaslab_t *ms;
 
 	ASSERT(!vd->vdev_ishole);
 
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
 		dmu_tx_t *tx;
 
 		ASSERT(vd == vd->vdev_top);
 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 		dmu_tx_commit(tx);
 	}
 
 	/* Once the vdev is empty, remove the metadata associated with it. */
 	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove(vd, txg);
 
 	/* Sync each dirty metaslab and requeue it on the clean list. */
 	for (ms = txg_list_remove(&vd->vdev_ms_list, txg); ms != NULL;
 	    ms = txg_list_remove(&vd->vdev_ms_list, txg)) {
 		metaslab_sync(ms, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, ms, TXG_CLEAN(txg));
 	}
 
 	/* Sync the DTL of each leaf queued for this txg. */
 	for (leaf = txg_list_remove(&vd->vdev_dtl_list, txg); leaf != NULL;
 	    leaf = txg_list_remove(&vd->vdev_dtl_list, txg))
 		vdev_dtl_sync(leaf, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 }
 
 /*
  * Convert 'psize' through the vdev type's vdev_op_asize operation and
  * return the result.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize;
 
 	asize = vd->vdev_ops->vdev_op_asize(vd, psize);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *top;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	/* Only leaf vdevs can be faulted. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	top = vd->vdev_top;
 
 	/*
 	 * The aux state is not used directly here, but it has to be
 	 * recorded so that a later vdev_reopen() remembers why the vdev
 	 * was faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/* Faulted wins over degraded. */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * Done, unless this device holds the only valid copy of the data.
 	 * Log devices and aux devices are never checked for that.
 	 */
 	if (top->vdev_islog || vd->vdev_aux != NULL ||
 	    !vdev_dtl_required(vd))
 		return (spa_vdev_state_exit(spa, vd, 0));
 
 	/* Back off: mark the vdev degraded rather than faulted. */
 	vd->vdev_degraded = 1ULL;
 	vd->vdev_faulted = 0ULL;
 
 	/*
 	 * Only if the device turns out not to be dead after a reopen is
 	 * it actually put into the degraded state.
 	 */
 	vdev_reopen(top);
 	if (vdev_readable(vd))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
  * user that something is wrong.  The vdev continues to operate as normal as far
  * as I/O is concerned.
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	/* Only leaf vdevs can be degraded. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/* Nothing to do for a vdev that is already faulted or degraded. */
 	if (vd->vdev_faulted || vd->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	/* Record the flag; only change the state of a vdev that is alive. */
 	vd->vdev_degraded = 1ULL;
 	if (!vdev_is_dead(vd))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *top, *anc, *rvd = spa->spa_root_vdev;
 	boolean_t was_offline;
 	boolean_t expand = ((flags & ZFS_ONLINE_EXPAND) != 0);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	/* Only leaf vdevs can be onlined. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/* An online event is posted only for a vdev that was offline. */
 	was_offline = (vd->vdev_offline == B_TRUE ||
 	    vd->vdev_tmpoffline == B_TRUE);
 
 	top = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = ((flags & ZFS_ONLINE_CHECKREMOVE) != 0);
 	vd->vdev_forcefault = ((flags & ZFS_ONLINE_FORCEFAULT) != 0);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (vd->vdev_aux == NULL) {
 		for (anc = vd; anc != rvd; anc = anc->vdev_parent)
 			anc->vdev_expanding = expand;
 	}
 
 	vdev_reopen(top);
 
 	/* The per-request flags only apply to the reopen above. */
 	vd->vdev_forcefault = B_FALSE;
 	vd->vdev_checkremove = B_FALSE;
 
 	if (vd->vdev_aux == NULL) {
 		for (anc = vd; anc != rvd; anc = anc->vdev_parent)
 			anc->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate != NULL)
 		*newstate = vd->vdev_state;
 
 	/*
 	 * With ZFS_ONLINE_UNSPARE, flag a live vdev that is the first
 	 * child of a spare vdev.
 	 */
 	if ((flags & ZFS_ONLINE_UNSPARE) != 0 && !vdev_is_dead(vd) &&
 	    vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if (expand || spa->spa_autoexpand) {
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux != NULL)
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	if (was_offline)
 		spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the leaf vdev identified by 'guid' offline.  Returns ENODEV if no
  * such vdev exists, ENOTSUP if it is not a leaf, EBUSY if offlining it
  * would lose data or leave its top-level vdev dead, the error from
  * spa_offline_log() for a log device, or otherwise the result of
  * spa_vdev_state_exit().  The only caller visible here, vdev_offline(),
  * holds spa_vdev_top_lock around this call.
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 	/* Restart point when the config changed while the lock was dropped. */
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/*
 	 * The generation we expect to see after the state lock has been
 	 * dropped and retaken below -- presumably spa_vdev_state_exit()
 	 * advances it by one; TODO confirm.
 	 */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			/* Runs with the vdev state lock dropped. */
 			error = spa_offline_log(spa);
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				/* Undo the passivate before failing or retrying. */
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	/* Record whether the offline was requested as temporary. */
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev identified by 'guid': a wrapper that runs
  * vdev_offline_locked() with spa_vdev_top_lock held and returns its
  * result.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  *
  * When the target is the root vdev, the l2cache and spare aux vdevs are
  * cleared too.  A vdev that is faulted, degraded, unreadable or unwritable
  * additionally has that state reset and its top-level vdev reopened.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 
 	/* Recurse so that every descendant's counters are zeroed as well. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * The l2cache and spare vdevs are held in the spa's aux arrays;
 	 * visit them explicitly when the whole pool is being cleared.
 	 */
 	if (vd == rvd) {
 		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
 			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
 
 		for (int c = 0; c < spa->spa_spares.sav_count; c++)
 			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
 	}
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 
 		/*
 		 * When reopening in response to a clear event, it may be due
 		 * to a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 
 		/*
 		 * Reopen the top-level vdev containing 'vd', or the whole
 		 * tree when 'vd' is the root.
 		 */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		/* The forced-fault window only covers the reopen above. */
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* Non-aux vdevs that came back alive get a resilver request. */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 }
 
 /*
  * Report whether a vdev must be treated as unusable.
  *
  * Besides vdevs whose state is below VDEV_STATE_DEGRADED, holes and
  * missing devices always count as "dead".  Folding them in here keeps
  * callers simple: they need no special cases for those device types and
  * can rely on dead devices being skipped before any I/O is issued.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_TRUE);
 
 	if (vd->vdev_ishole)
 		return (B_TRUE);
 
 	return (vd->vdev_ops == &vdev_missing_ops);
 }
 
 /*
  * A vdev is readable when it is not dead and has not been flagged
  * vdev_cant_read.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead and has not been flagged
  * vdev_cant_write.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_write);
 }
 
 /*
  * Decide whether new allocations may be directed at this vdev.
  *
  * Vdevs that are in the middle of reopening (VDEV_STATE_CLOSED) are
  * currently still accepted; should the reopen fail, that is caught later
  * under the proper locks.  The state is sampled once into a local because,
  * although it changes atomically, two separate questions are asked of it.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	uint64_t sampled = vd->vdev_state;
 
 	/* Below DEGRADED is only acceptable for the transient CLOSED state. */
 	if (sampled < VDEV_STATE_DEGRADED && sampled != VDEV_STATE_CLOSED)
 		return (B_FALSE);
 
 	if (vd->vdev_cant_write)
 		return (B_FALSE);
 
 	return (!vd->vdev_ishole);
 }
 
 /*
  * Decide whether 'zio' may be issued to 'vd'.  Dead vdevs and vdevs with a
  * pending removal request are never accessible; otherwise reads and writes
  * are gated by vdev_cant_read / vdev_cant_write and every other I/O type is
  * allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Get statistics for the given vdev.
  *
  * Copies vd->vdev_stat into 'vs' under vdev_stat_lock and then fills in the
  * derived fields: vs_timestamp becomes the time elapsed since the stored
  * timestamp, vs_rsize is the minimum asize (plus the label areas for a
  * leaf), vs_esize is the gap between vdev_max_asize and vdev_asize, and the
  * ashift fields are taken from the vdev (the configured ashift from its
  * top-level vdev when there is one).  For the root vdev the per-type op and
  * byte counters of all top-level children are added in.
  *
  * The caller must hold at least one of the SCL_ALL config locks as reader.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
 	if (vd->vdev_ops->vdev_op_leaf)
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 	if (vd->vdev_max_asize != 0)
 		vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
 	vs->vs_configured_ashift = vd->vdev_top != NULL
 	    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
 	vs->vs_logical_ashift = vd->vdev_logical_ashift;
 	vs->vs_physical_ashift = vd->vdev_physical_ashift;
 
 	/*
 	 * Fragmentation is tracked per metaslab group, so it only exists for
 	 * real (non-aux, non-hole) top-level vdevs.  A top-level vdev that
 	 * has just been added may not have its metaslab group set up yet, in
 	 * which case vdev_mg is NULL; leave vs_fragmentation at the value
 	 * copied from vd->vdev_stat rather than dereferencing it.
 	 */
 	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole &&
 	    vd->vdev_mg != NULL) {
 		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
 	}
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
 	 * over all top-level vdevs (i.e. the direct children of the root).
 	 */
 	if (vd == rvd) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 
 			for (int t = 0; t < ZIO_TYPES; t++) {
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
 			/* Refresh the child's own "removing" indicator. */
 			cvs->vs_scan_removing = cvd->vdev_removing;
 		}
 	}
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Zero the space accounting (vs_space, vs_dspace, vs_alloc) of a vdev
  * while holding its vdev_stat_lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	vdev_stat_t *vs = &vd->vdev_stat;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vs->vs_alloc = 0;
 	vs->vs_dspace = 0;
 	vs->vs_space = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset vs_scan_processed to zero for 'vd' and, recursively, for every
  * vdev beneath it.  Children are handled first; each vdev's counter is
  * cleared under its own vdev_stat_lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	int c;
 
 	for (c = 0; c < vd->vdev_children; c++)
 		vdev_scan_stat_init(vd->vdev_child[c]);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Account a completed zio in the statistics of the vdev it was issued to
  * (or of the root vdev when zio->io_vd is NULL).  'psize' is the byte count
  * added to the byte counters.
  *
  * Successful I/O bumps the per-type op/byte counters and, for repair I/O,
  * the scan and self-heal counters.  Failed I/O bumps the read, write or
  * checksum error counters and, for qualifying writes, records the txg in
  * the vdev's dirty time logs.
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		/* I/O flagged as bypass is not counted at all. */
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_phys_t *scn_phys =
 				    &spa->spa_dsl_pool->dp_scan->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/*
 				 * The pool-wide scan counter is only advanced
 				 * for leaf vdevs; the per-vdev counter is
 				 * advanced for every vdev.
 				 */
 				/* XXX cleanup? */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		vs->vs_ops[type]++;
 		vs->vs_bytes[type] += psize;
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* From here on the zio failed. */
 
 	/* Errors on speculative I/O are not recorded. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent logs writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	/*
 	 * Bump the matching error counter, but only while the vdev is not
 	 * dead.  Failed reads are split into checksum errors (ECKSUM) and
 	 * plain read errors.
 	 */
 	mutex_enter(&vd->vdev_stat_lock);
 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
 		if (zio->io_error == ECKSUM)
 			vs->vs_checksum_errors++;
 		else
 			vs->vs_read_errors++;
 	}
 	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Nothing more to do if this txg is already missing. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the leaf and every ancestor below the root. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Update the in-core space usage stats for this vdev, its metaslab class,
  * and the root vdev.
  *
  * 'vd' must be a top-level vdev.  'alloc_delta' and 'space_delta' are added
  * to vs_alloc and vs_space; a deflated copy of 'space_delta' is added to
  * vs_dspace.  'defer_delta' is only passed through to the metaslab class.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	int64_t dspace_delta = space_delta;
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_group_t *mg = vd->vdev_mg;
 	/* A vdev without a metaslab group has no class to update. */
 	metaslab_class_t *mc = mg ? mg->mg_class : NULL;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
 	 * factor.  We must calculate this here and not at the root vdev
 	 * because the root vdev's psize-to-asize is simply the max of its
 	 * childrens', thus not accurate enough for us.
 	 */
 	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
 	    vd->vdev_deflate_ratio;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/* Only vdevs in the normal class contribute to the root's totals. */
 	if (mc == spa_normal_class(spa)) {
 		mutex_enter(&rvd->vdev_stat_lock);
 		rvd->vdev_stat.vs_alloc += alloc_delta;
 		rvd->vdev_stat.vs_space += space_delta;
 		rvd->vdev_stat.vs_dspace += dspace_delta;
 		mutex_exit(&rvd->vdev_stat_lock);
 	}
 
 	if (mc != NULL) {
 		ASSERT(rvd == vd->vdev_parent);
 		ASSERT(vd->vdev_ms_count != 0);
 
 		metaslab_class_space_update(mc,
 		    alloc_delta, defer_delta, space_delta, dspace_delta);
 	}
 }
 
 /*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
  * so that it will be written out next time the vdev configuration is synced.
  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
  *
  * Aux vdevs are not placed on the dirty list; their entry in the aux config
  * array is regenerated in place instead.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 
 		/* Find this vdev's slot in the aux vdev array. */
 		for (c = 0; c < sav->sav_count; c++) {
 			if (sav->sav_vdevs[c] == vd)
 				break;
 		}
 
 		if (c == sav->sav_count) {
 			/*
 			 * We're being removed.  There's nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/*
 		 * The config array is stored under either the L2CACHE or
 		 * the SPARES key: try the former and insist on the latter
 		 * if it is absent.
 		 */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(c < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	if (vd == rvd) {
 		for (c = 0; c < rvd->vdev_children; c++)
 			vdev_config_dirty(rvd->vdev_child[c]);
 	} else {
 		ASSERT(vd == vd->vdev_top);
 
 		/* Holes are never queued; a vdev is queued at most once. */
 		if (!list_link_active(&vd->vdev_config_dirty_node) &&
 		    !vd->vdev_ishole)
 			list_insert_head(&spa->spa_config_dirty_list, vd);
 	}
 }
 
 /*
  * Take a vdev off the spa's config dirty list.  The vdev must currently
  * be on that list, and the caller must satisfy the same SCL_CONFIG locking
  * rule that governs additions to it.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	/* Writer, or the sync thread holding the lock as reader. */
 	ASSERT(spa_config_held(pool, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 
 	list_remove(&pool->spa_config_dirty_list, vd);
 }
 
 /*
  * Flag a top-level vdev's state as dirty so that the next spa_sync() pass
  * can turn it into a vdev_config_dirty().  State changes are tracked
  * separately from full config changes because they need far less locking
  * and are frequently required by administrative actions.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_writeable(pool));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * SCL_STATE guards the state dirty list.  Callers either hold it as
 	 * writer or are the sync thread holding it as reader; since there is
 	 * just one sync thread, that is enough for mutual exclusion.
 	 */
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 
 	/* Already queued vdevs and holes are left alone. */
 	if (list_link_active(&vd->vdev_state_dirty_node) || vd->vdev_ishole)
 		return;
 
 	list_insert_head(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the spa's state dirty list.  The vdev must currently be
  * on that list, and the caller must hold SCL_STATE as writer or be the
  * sync thread holding it as reader.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 
 	list_remove(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Propagate vdev state up from children to parent.
  *
  * For an interior vdev, count how many (non-hole) children are faulted or
  * degraded and hand those counts to the vdev's vdev_op_state_change method.
  * Then repeat for the parent, all the way up the tree.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int degraded = 0, faulted = 0;
 	int corrupted = 0;
 	vdev_t *child;
 
 	if (vd->vdev_children > 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			child = vd->vdev_child[c];
 
 			/*
 			 * Don't factor holes into the decision.
 			 */
 			if (child->vdev_ishole)
 				continue;
 
 			/*
 			 * An unreadable child always counts against the
 			 * parent; an unwritable one only does so when the
 			 * pool itself is writeable.
 			 */
 			if (!vdev_readable(child) ||
 			    (!vdev_writeable(child) && spa_writeable(spa))) {
 				/*
 				 * Root special: if there is a top-level log
 				 * device, treat the root vdev as if it were
 				 * degraded.
 				 */
 				if (child->vdev_islog && vd == rvd)
 					degraded++;
 				else
 					faulted++;
 			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
 				degraded++;
 			}
 
 			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
 				corrupted++;
 		}
 
 		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
 
 		/*
 		 * Root special: if there is a top-level vdev that cannot be
 		 * opened due to corrupted metadata, then propagate the root
 		 * vdev's aux state as 'corrupt' rather than 'insufficient
 		 * replicas'.
 		 */
 		if (corrupted && vd == rvd &&
 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 	}
 
 	/* Continue up the tree until the root has been processed. */
 	if (vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  * 'isopen' is B_TRUE when called as part of an open, 'state' is the new
  * state, and 'aux' is stored in vs_aux as the accompanying detail code.
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	/* No state transition: just refresh the aux code. */
 	if (state == vd->vdev_state) {
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the old state; it is reported in the ereport below. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * If we have brought this vdev back into service, we need
 	 * to notify fmd so that it can gracefully repair any outstanding
 	 * cases due to a missing device.  We do this in all cases, even those
 	 * that probably don't correlate to a repaired fault.  This is sure to
 	 * catch all cases, and we let the zfs-retire agent sort it out.  If
 	 * this is a transient state it's OK, as the retire agent will
 	 * double-check the state of the vdev before repairing it.
 	 */
 	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_prevstate != state)
 		zfs_post_state_change(spa, vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Map the aux code onto the ereport class to post. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/* Outside of an open, let the ancestors recompute their state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Verify that a vdev configuration can support a root pool.
  *
  * Solaris does not support RAID-Z or partial configurations for booting.
  * In addition, it permits only a single top-level vdev and none of the
  * leaves can be wholedisks.
  *
  * FreeBSD can boot from any configuration.  There is a limitation that
  * the boot filesystem must be either uncompressed or compressed with lzjb
  * compression, but it is not clear how to enforce that here.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 #ifdef illumos
 	int c;
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		char *kind = vd->vdev_ops->vdev_op_type;
 
 		/* A root with more than one top-level vdev is rejected. */
 		if (strcmp(kind, VDEV_TYPE_ROOT) == 0 &&
 		    vd->vdev_children > 1)
 			return (B_FALSE);
 
 		/* So are RAID-Z and missing vdevs. */
 		if (strcmp(kind, VDEV_TYPE_RAIDZ) == 0 ||
 		    strcmp(kind, VDEV_TYPE_MISSING) == 0)
 			return (B_FALSE);
 	}
 
 	/* Every child must be bootable as well. */
 	for (c = 0; c < vd->vdev_children; c++) {
 		if (!vdev_is_bootable(vd->vdev_child[c]))
 			return (B_FALSE);
 	}
 #endif	/* illumos */
 	return (B_TRUE);
 }
 
 /*
  * Carry persistent state over from the original vdev tree (ovd), which
  * was retrieved from the MOS config object, to the matching device in the
  * current vdev tree (nvd).  If the original vdev was offline or faulted,
  * the current one ends up in that same state.
  */
 void
 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
 {
 	spa_t *pool = nvd->vdev_spa;
 	int c;
 
 	ASSERT(nvd->vdev_top->vdev_islog);
 	ASSERT(spa_config_held(pool, SCL_STATE_ALL, RW_WRITER) ==
 	    SCL_STATE_ALL);
 	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
 
 	/* Walk both trees in step, pairing children by index. */
 	for (c = 0; c < nvd->vdev_children; c++)
 		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
 
 	if (!nvd->vdev_ops->vdev_op_leaf)
 		return;
 
 	/* Only leaves have their persistent state restored. */
 	nvd->vdev_removed = ovd->vdev_removed;
 	nvd->vdev_degraded = ovd->vdev_degraded;
 	nvd->vdev_faulted = ovd->vdev_faulted;
 	nvd->vdev_offline = ovd->vdev_offline;
 }
 
 /*
  * Decide whether a log device still holds valid content.  A vdev that was
  * removed or faulted in the MOS config is known to have had its log
  * content written to the pool already, so the tree is valid as soon as one
  * leaf is found that is neither faulted nor removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	int c;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (!vd->vdev_faulted && !vd->vdev_removed)
 			return (B_TRUE);
 	}
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		if (vdev_log_state_valid(vd->vdev_child[c]))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Expand a top-level vdev if possible: when its asize covers more
  * metaslabs than vdev_ms_count records, (re)initialize the metaslabs for
  * 'txg' and mark the vdev's config dirty.  The caller holds all config
  * locks as writer.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* No room for additional metaslabs: nothing to do. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev: detach 'vd' from its parent and compact the parent's
  * child array.  If exactly one child is left, the parent is removed from
  * above that child and the child is flagged as splitting.  State is then
  * propagated upward starting from the parent's first child.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	vdev_t *first;
 
 	vdev_remove_child(parent, vd);
 	vdev_compact_children(parent);
 
 	first = parent->vdev_child[0];
 	if (parent->vdev_children == 1) {
 		vdev_remove_parent(first);
 		first->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(first);
 }
 
 /*
  * Deadman check.  Walk the vdev tree and, for every leaf that has active
  * I/O, panic the system if the first zio in the leaf's active tree has been
  * outstanding for longer than spa_deadman_synctime().
  */
 void
 vdev_deadman(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		vdev_deadman(cvd);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_queue_t *vq = &vd->vdev_queue;
 
 		/* vq_lock is held while the active tree is examined. */
 		mutex_enter(&vq->vq_lock);
 		if (avl_numnodes(&vq->vq_active_tree) > 0) {
 			spa_t *spa = vd->vdev_spa;
 			zio_t *fio;
 			uint64_t delta;
 
 			/*
 			 * Look at the head of all the pending queues,
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime we panic the system.
 			 */
 			fio = avl_first(&vq->vq_active_tree);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa)) {
 				/*
 				 * Cast every integer argument so that it
 				 * matches its %llu conversion regardless of
 				 * how the platform defines the underlying
 				 * types.
 				 */
 				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
 				    "delta %lluns, last io %lluns",
 				    (unsigned long long)fio->io_timestamp,
 				    (unsigned long long)delta,
 				    (unsigned long long)vq->vq_io_complete_ts);
 				fm_panic("I/O to pool '%s' appears to be "
 				    "hung on vdev guid %llu at '%s'.",
 				    spa_name(spa),
 				    (long long unsigned int) vd->vdev_guid,
 				    vd->vdev_path);
 			}
 		}
 		mutex_exit(&vq->vq_lock);
 	}
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	(revision 296519)
@@ -1,915 +1,916 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
 #include <sys/dsl_pool.h>
 
 /*
  * ZFS I/O Scheduler
  * ---------------
  *
  * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
  * I/O scheduler determines when and in what order those operations are
  * issued.  The I/O scheduler divides operations into six I/O classes
  * prioritized in the following order: sync read, sync write, async read,
  * async write, scrub/resilver and trim.  Each queue defines the minimum and
  * maximum number of concurrent operations that may be issued to the device.
  * In addition, the device has an aggregate maximum. Note that the sum of the
  * per-queue minimums must not exceed the aggregate maximum, and if the
  * aggregate maximum is equal to or greater than the sum of the per-queue
  * maximums, the per-queue minimum has no effect.
  *
  * For many physical devices, throughput increases with the number of
  * concurrent operations, but latency typically suffers. Further, physical
  * devices typically have a limit at which more concurrent operations have no
  * effect on throughput or can actually cause it to decrease.
  *
  * The scheduler selects the next operation to issue by first looking for an
  * I/O class whose minimum has not been satisfied. Once all are satisfied and
  * the aggregate maximum has not been hit, the scheduler looks for classes
  * whose maximum has not been satisfied. Iteration through the I/O classes is
  * done in the order specified above. No further operations are issued if the
  * aggregate maximum number of concurrent operations has been hit or if there
  * are no operations queued for an I/O class that has not hit its maximum.
  * Every time an I/O is queued or an operation completes, the I/O scheduler
  * looks for new operations to issue.
  *
  * All I/O classes have a fixed maximum number of outstanding operations
  * except for the async write class. Asynchronous writes represent the data
  * that is committed to stable storage during the syncing stage for
  * transaction groups (see txg.c). Transaction groups enter the syncing state
  * periodically so the number of queued async writes will quickly burst up and
  * then bleed down to zero. Rather than servicing them as quickly as possible,
  * the I/O scheduler changes the maximum number of active async write I/Os
  * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  * both throughput and latency typically increase with the number of
  * concurrent operations issued to physical devices, reducing the burstiness
  * in the number of concurrent operations also stabilizes the response time of
  * operations from other -- and in particular synchronous -- queues. In broad
  * strokes, the I/O scheduler will issue more concurrent operations from the
  * async write queue as there's more dirty data in the pool.
  *
  * Async Writes
  *
  * The number of concurrent operations issued for the async write I/O class
  * follows a piece-wise linear function defined by a few adjustable points.
  *
  *        |                   o---------| <-- zfs_vdev_async_write_max_active
  *   ^    |                  /^         |
  *   |    |                 / |         |
  * active |                /  |         |
  *  I/O   |               /   |         |
  * count  |              /    |         |
  *        |             /     |         |
  *        |------------o      |         | <-- zfs_vdev_async_write_min_active
  *       0|____________^______|_________|
  *        0%           |      |       100% of zfs_dirty_data_max
  *                     |      |
  *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
  *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
  *
  * Until the amount of dirty data exceeds a minimum percentage of the dirty
  * data allowed in the pool, the I/O scheduler will limit the number of
  * concurrent operations to the minimum. As that threshold is crossed, the
  * number of concurrent operations issued increases linearly to the maximum at
  * the specified maximum percentage of the dirty data allowed in the pool.
  *
  * Ideally, the amount of dirty data on a busy pool will stay in the sloped
  * part of the function between zfs_vdev_async_write_active_min_dirty_percent
  * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
  * maximum percentage, this indicates that the rate of incoming data is
  * greater than the rate that the backend storage can handle. In this case, we
  * must further throttle incoming writes (see dmu_tx_delay() for details).
  */
 
 /*
  * The maximum number of I/Os active to each device.  Ideally, this will be >=
  * the sum of each queue's max_active.  It must be at least the sum of each
  * queue's min_active.
  *
  * vdev_queue_class_to_issue() refuses to pick any class once the number of
  * entries in vq_active_tree reaches this value.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of I/Os active to each device.  If the
  * sum of the queue's max_active is < zfs_vdev_max_active, then the
  * min_active comes into play.  We will send min_active from each queue,
  * and then select from queues in the order defined by zio_priority_t.
  *
  * In general, smaller max_active's will lead to lower latency of synchronous
  * operations.  Larger max_active's may lead to higher overall throughput,
  * depending on underlying storage.
  *
  * The ratio of the queues' max_actives determines the balance of performance
  * between reads, writes, and scrubs.  E.g., increasing
  * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
  * more quickly, but reads and writes to have higher latency and lower
  * throughput.
  *
  * These values are read by vdev_queue_class_min_active() and
  * vdev_queue_class_max_active().  The async write maximum is not used
  * directly; vdev_queue_max_async_writes() scales between the async write
  * minimum and maximum according to the amount of dirty data.
  */
 uint32_t zfs_vdev_sync_read_min_active = 10;
 uint32_t zfs_vdev_sync_read_max_active = 10;
 uint32_t zfs_vdev_sync_write_min_active = 10;
 uint32_t zfs_vdev_sync_write_max_active = 10;
 uint32_t zfs_vdev_async_read_min_active = 1;
 uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 1;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_trim_min_active = 1;
 /*
  * TRIM max active is large in comparison to the other values due to the fact
  * that TRIM IOs are coalesced at the device layer. This value is set such
  * that a typical SSD can process the queued IOs in a single request.
  */
 uint32_t zfs_vdev_trim_max_active = 64;
 
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
  * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
  * zfs_vdev_async_write_active_max_dirty_percent, use
  * zfs_vdev_async_write_max_active. The value is linearly interpolated
  * between min and max.
  *
  * Both percentages are taken relative to zfs_dirty_data_max (see
  * vdev_queue_max_async_writes()).
  */
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  *
  * The limits are compared against differences of io_offset/io_size values
  * in vdev_queue_aggregate(), so they are expressed in the same units as
  * those fields.  The defaults below evaluate to 32768 and 4096.
  */
 int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
 #ifdef __FreeBSD__
 /* Declare the parent sysctl node that all knobs in this file hang off. */
 SYSCTL_DECL(_vfs_zfs_vdev);
 
 /*
  * The two dirty-percent knobs are registered with SYSCTL_PROC rather than
  * a plain integer macro so that writes go through the handler functions
  * named here.  The handlers are defined later in this file, hence the
  * forward declarations.
  */
 static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
     sysctl_zfs_async_write_active_min_dirty_percent, "I",
     "Percentage of async write dirty data below which "
     "async_write_min_active is used.");
 
 static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
     sysctl_zfs_async_write_active_max_dirty_percent, "I",
     "Percentage of async write dirty data above which "
     "async_write_max_active is used.");
 
 /* Exposes zfs_vdev_max_active directly, with no validation handler. */
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
     &zfs_vdev_max_active, 0,
     "The maximum number of I/Os of all types active for each device.");
 
 /*
  * Helpers that register the <name>_min_active and <name>_max_active knobs
  * under the _vfs_zfs_vdev node, each backed by the matching
  * zfs_vdev_<name>_{min,max}_active variable.
  *
  * The macro bodies deliberately do not end in a semicolon; the terminating
  * semicolon is supplied at each use so that no stray empty declaration is
  * left at file scope.
  */
 #define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
     &zfs_vdev_ ## name ## _min_active, 0,				\
     "Initial number of I/O requests of type " #name			\
     " active for each device")
 
 #define ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
     &zfs_vdev_ ## name ## _max_active, 0,				\
     "Maximum number of I/O requests of type " #name			\
     " active for each device")
 
 ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
 ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
 ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
 ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
 ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
 ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
 ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
 ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
 ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
 ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
 ZFS_VDEV_QUEUE_KNOB_MIN(trim);
 ZFS_VDEV_QUEUE_KNOB_MAX(trim);
 
 /* Undefine the helpers that were actually defined above. */
 #undef ZFS_VDEV_QUEUE_KNOB_MIN
 #undef ZFS_VDEV_QUEUE_KNOB_MAX
 
 /*
  * Knobs for the aggregation limits consulted by vdev_queue_aggregate().
  * They are exported as plain integers with no range validation.
  */
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
     &zfs_vdev_aggregation_limit, 0,
     "I/O requests are aggregated up to this size");
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
     &zfs_vdev_read_gap_limit, 0,
     "Acceptable gap between two reads being aggregated");
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
     &zfs_vdev_write_gap_limit, 0,
     "Acceptable gap between two writes being aggregated");
 
 static int
 sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_vdev_async_write_active_min_dirty_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 	
 	if (val < 0 || val > 100 ||
 	    val >= zfs_vdev_async_write_active_max_dirty_percent)
 		return (EINVAL);
 
 	zfs_vdev_async_write_active_min_dirty_percent = val;
 
 	return (0);
 }
 
 static int
 sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
 {
 	int val, err;
 
 	val = zfs_vdev_async_write_active_max_dirty_percent;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < 0 || val > 100 ||
 	    val <= zfs_vdev_async_write_active_min_dirty_percent)
 		return (EINVAL);
 
 	zfs_vdev_async_write_active_max_dirty_percent = val;
 
 	return (0);
 }
 #endif
 
 /*
  * AVL comparator ordering zios by io_offset, with the zio's own address as
  * the final tie-breaker so that two distinct zios never compare equal.
  */
 int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
 	const zio_t *lhs = x1;
 	const zio_t *rhs = x2;
 
 	if (lhs->io_offset != rhs->io_offset)
 		return (lhs->io_offset < rhs->io_offset ? -1 : 1);
 
 	if (lhs != rhs)
 		return (lhs < rhs ? -1 : 1);
 
 	return (0);
 }
 
 /* Return the tree of queued (not yet issued) zios for priority class p. */
 static inline avl_tree_t *
 vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 {
 	avl_tree_t *queued = &vq->vq_class[p].vqc_queued_tree;
 
 	return (queued);
 }
 
 /*
  * Return the per-type offset tree for reads or writes, or NULL for any
  * other zio type (no offset tree is kept for those).
  */
 static inline avl_tree_t *
 vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 {
 	switch (t) {
 	case ZIO_TYPE_READ:
 		return (&vq->vq_read_offset_tree);
 	case ZIO_TYPE_WRITE:
 		return (&vq->vq_write_offset_tree);
 	default:
 		return (NULL);
 	}
 }
 
 /*
  * AVL comparator ordering zios by io_timestamp, then by io_offset, and
  * finally by the zio's own address so that distinct zios never compare
  * equal.
  */
 int
 vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
 	const zio_t *lhs = x1;
 	const zio_t *rhs = x2;
 
 	if (lhs->io_timestamp != rhs->io_timestamp)
 		return (lhs->io_timestamp < rhs->io_timestamp ? -1 : 1);
 
 	if (lhs->io_offset != rhs->io_offset)
 		return (lhs->io_offset < rhs->io_offset ? -1 : 1);
 
 	if (lhs != rhs)
 		return (lhs < rhs ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * Initialize the I/O queue embedded in vd: the queue lock, the tree of
  * active (issued) I/Os, the per-type read/write offset trees, and one
  * queued tree per queueable priority class.
  */
 void
 vdev_queue_init(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 	vq->vq_vdev = vd;
 
 	/*
 	 * The active tree and the per-class trees below link zios through
 	 * io_queue_node, while the per-type offset trees use the separate
 	 * io_offset_node.
 	 */
 	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
 	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 
 	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		int (*compfn) (const void *, const void *);
 
 		/*
 		 * The synchronous i/o queues are dispatched in FIFO rather
 		 * than LBA order.  This provides more consistent latency for
 		 * these i/os.
 		 */
 		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
 			compfn = vdev_queue_timestamp_compare;
 		else
 			compfn = vdev_queue_offset_compare;
 
 		avl_create(vdev_queue_class_tree(vq, p), compfn,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
 
 	/* Value reported by vdev_queue_lastoffset() until one is registered. */
 	vq->vq_lastoffset = 0;
 }
 
 /*
  * Tear down the I/O queue embedded in vd: destroy every tree created by
  * vdev_queue_init() and then the queue lock.
  */
 void
 vdev_queue_fini(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
 		avl_destroy(vdev_queue_class_tree(vq, p));
 	avl_destroy(&vq->vq_active_tree);
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
 	mutex_destroy(&vq->vq_lock);
 }
 
 /*
  * Insert zio into the queued tree of its priority class and, for reads and
  * writes, into the matching per-type offset tree.  The caller must hold
  * vq_lock.  On illumos the spa-wide queue statistics are updated as well.
  */
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 #ifdef illumos
 	/* Only the kstat accounting below needs the spa. */
 	spa_t *spa = zio->io_spa;
 #endif
 	avl_tree_t *qtt;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	/* No per-type tree exists for types other than read and write. */
 	qtt = vdev_queue_type_tree(vq, zio->io_type);
 	if (qtt)
 		avl_add(qtt, zio);
 
 #ifdef illumos
 	mutex_enter(&spa->spa_iokstat_lock);
 	spa->spa_queue_stats[zio->io_priority].spa_queued++;
 	if (spa->spa_iokstat != NULL)
 		kstat_waitq_enter(spa->spa_iokstat->ks_data);
 	mutex_exit(&spa->spa_iokstat_lock);
 #endif
 }
 
 /*
  * Remove zio from the queued tree of its priority class and, for reads and
  * writes, from the matching per-type offset tree.  The caller must hold
  * vq_lock.  On illumos the spa-wide queue statistics are updated as well.
  */
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 #ifdef illumos
 	/* Only the kstat accounting below needs the spa. */
 	spa_t *spa = zio->io_spa;
 #endif
 	avl_tree_t *qtt;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	/* No per-type tree exists for types other than read and write. */
 	qtt = vdev_queue_type_tree(vq, zio->io_type);
 	if (qtt)
 		avl_remove(qtt, zio);
 
 #ifdef illumos
 	mutex_enter(&spa->spa_iokstat_lock);
 	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
 	spa->spa_queue_stats[zio->io_priority].spa_queued--;
 	if (spa->spa_iokstat != NULL)
 		kstat_waitq_exit(spa->spa_iokstat->ks_data);
 	mutex_exit(&spa->spa_iokstat_lock);
 #endif
 }
 
 /*
  * Account for zio as issued: bump the active count of its priority class
  * and insert it into vq_active_tree.  The caller must hold vq_lock.  On
  * illumos the spa-wide run-queue statistics are updated as well.
  */
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
 #ifdef illumos
 	/* Only the kstat accounting below needs the spa. */
 	spa_t *spa = zio->io_spa;
 #endif
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
 	avl_add(&vq->vq_active_tree, zio);
 
 #ifdef illumos
 	mutex_enter(&spa->spa_iokstat_lock);
 	spa->spa_queue_stats[zio->io_priority].spa_active++;
 	if (spa->spa_iokstat != NULL)
 		kstat_runq_enter(spa->spa_iokstat->ks_data);
 	mutex_exit(&spa->spa_iokstat_lock);
 #endif
 }
 
 /*
  * Account for zio as completed: drop the active count of its priority
  * class and remove it from vq_active_tree.  The caller must hold vq_lock.
  * On illumos the spa-wide run-queue and read/write byte statistics are
  * updated as well.
  */
 static void
 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
 #ifdef illumos
 	/* Only the kstat accounting below needs the spa. */
 	spa_t *spa = zio->io_spa;
 #endif
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
 	avl_remove(&vq->vq_active_tree, zio);
 
 #ifdef illumos
 	mutex_enter(&spa->spa_iokstat_lock);
 	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
 	spa->spa_queue_stats[zio->io_priority].spa_active--;
 	if (spa->spa_iokstat != NULL) {
 		kstat_io_t *ksio = spa->spa_iokstat->ks_data;
 
 		kstat_runq_exit(spa->spa_iokstat->ks_data);
 		if (zio->io_type == ZIO_TYPE_READ) {
 			ksio->reads++;
 			ksio->nread += zio->io_size;
 		} else if (zio->io_type == ZIO_TYPE_WRITE) {
 			ksio->writes++;
 			ksio->nwritten += zio->io_size;
 		}
 	}
 	mutex_exit(&spa->spa_iokstat_lock);
 #endif
 }
 
 /*
  * Completion callback for an aggregate i/o.  For reads, each parent's
  * slice of the aggregate buffer is copied back into that parent's own
  * buffer; in all cases the aggregate buffer is then freed.
  */
 static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
 	zio_t *parent;
 
 	if (aio->io_type == ZIO_TYPE_READ) {
 		for (parent = zio_walk_parents(aio); parent != NULL;
 		    parent = zio_walk_parents(aio)) {
 			/* Parent data sits at its offset within the span. */
 			char *slice = (char *)aio->io_data +
 			    (parent->io_offset - aio->io_offset);
 
 			bcopy(slice, parent->io_data, parent->io_size);
 		}
 	}
 
 	zio_buf_free(aio->io_data, aio->io_size);
 }
 
 /*
  * Return the configured minimum number of active i/os for priority class
  * p.  Panics on a priority that has no queue.
  */
 static int
 vdev_queue_class_min_active(zio_priority_t p)
 {
 	int floor;
 
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
 		floor = zfs_vdev_sync_read_min_active;
 		break;
 	case ZIO_PRIORITY_SYNC_WRITE:
 		floor = zfs_vdev_sync_write_min_active;
 		break;
 	case ZIO_PRIORITY_ASYNC_READ:
 		floor = zfs_vdev_async_read_min_active;
 		break;
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		floor = zfs_vdev_async_write_min_active;
 		break;
 	case ZIO_PRIORITY_SCRUB:
 		floor = zfs_vdev_scrub_min_active;
 		break;
 	case ZIO_PRIORITY_TRIM:
 		floor = zfs_vdev_trim_min_active;
 		break;
 	default:
 		panic("invalid priority %u", p);
 		floor = 0;
 		break;
 	}
 
 	return (floor);
 }
 
 /*
  * Compute the current limit on concurrently active async writes.
  *
  * Returns zfs_vdev_async_write_max_active when a sync task is pending or
  * when the pool's dirty data is at or above the max dirty threshold,
  * zfs_vdev_async_write_min_active when it is below the min dirty
  * threshold, and a linear interpolation between the two otherwise.  Both
  * thresholds are percentages of zfs_dirty_data_max.
  */
 static __noinline int
 vdev_queue_max_async_writes(spa_t *spa)
 {
 	int writes;
 	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
 	uint64_t min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint64_t max_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_max_dirty_percent / 100;
 
 	/*
 	 * Sync tasks correspond to interactive user actions. To reduce the
 	 * execution time of those actions we push data out as fast as possible.
 	 */
 	if (spa_has_pending_synctask(spa)) {
 		return (zfs_vdev_async_write_max_active);
 	}
 
 	if (dirty < min_bytes)
 		return (zfs_vdev_async_write_min_active);
 	/*
 	 * Use >= rather than >: at dirty == max_bytes the interpolation below
 	 * already evaluates to the maximum, and returning here guarantees
 	 * that max_bytes - min_bytes is non-zero when we divide by it, even
 	 * if the two thresholds work out to the same byte count.
 	 */
 	if (dirty >= max_bytes)
 		return (zfs_vdev_async_write_max_active);
 
 	/*
 	 * linear interpolation:
 	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
 	 * move right by min_bytes
 	 * move up by min_writes
 	 */
 	writes = (dirty - min_bytes) *
 	    (zfs_vdev_async_write_max_active -
 	    zfs_vdev_async_write_min_active) /
 	    (max_bytes - min_bytes) +
 	    zfs_vdev_async_write_min_active;
 	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
 	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
 	return (writes);
 }
 
 /*
  * Return the maximum number of active i/os for priority class p.  Every
  * class uses its configured limit except async writes, whose limit is
  * computed by vdev_queue_max_async_writes().  Panics on a priority that has
  * no queue.
  */
 static int
 vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 {
 	int ceiling;
 
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
 		ceiling = zfs_vdev_sync_read_max_active;
 		break;
 	case ZIO_PRIORITY_SYNC_WRITE:
 		ceiling = zfs_vdev_sync_write_max_active;
 		break;
 	case ZIO_PRIORITY_ASYNC_READ:
 		ceiling = zfs_vdev_async_read_max_active;
 		break;
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		ceiling = vdev_queue_max_async_writes(spa);
 		break;
 	case ZIO_PRIORITY_SCRUB:
 		ceiling = zfs_vdev_scrub_max_active;
 		break;
 	case ZIO_PRIORITY_TRIM:
 		ceiling = zfs_vdev_trim_max_active;
 		break;
 	default:
 		panic("invalid priority %u", p);
 		ceiling = 0;
 		break;
 	}
 
 	return (ceiling);
 }
 
 /*
  * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
  * there is no eligible class.
  */
 static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
 	zio_priority_t prio;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	/* The device-wide limit on active i/os has been reached. */
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/*
 	 * First pass: pick the first class, in priority order, that has
 	 * queued i/os and is still below its minimum number of active i/os.
 	 */
 	for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
 			continue;
 		if (vq->vq_class[prio].vqc_active <
 		    vdev_queue_class_min_active(prio))
 			return (prio);
 	}
 
 	/*
 	 * Second pass: every minimum is satisfied, so pick the first class
 	 * that has queued i/os and is still below its maximum.
 	 */
 	for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
 			continue;
 		if (vq->vq_class[prio].vqc_active <
 		    vdev_queue_class_max_active(spa, prio))
 			return (prio);
 	}
 
 	/* Nothing is eligible to be issued. */
 	return (ZIO_PRIORITY_NUM_QUEUEABLE);
 }
 
 /*
  * Compute the range spanned by two i/os, which is the endpoint of the last
  * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
  * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
  * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
  *
  * Both macros evaluate their arguments more than once, so the arguments
  * must be free of side effects.
  */
 #define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
 #define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 
 /*
  * Try to build a single aggregate i/o out of zio and the queued i/os
  * adjacent to it in the per-type offset tree.
  *
  * Returns the new aggregate zio, with every constituent removed from the
  * queue and made a parent of it, or NULL when no aggregation takes place
  * (aggregation disabled for this zio, no offset tree for its type, or no
  * suitable neighbor).  When NULL is returned zio is left on the queue.
  * The caller must hold vq_lock.
  */
 static zio_t *
 vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 {
 	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 	uint64_t maxgap = 0;
 	uint64_t size;
 	boolean_t stretch;
 	avl_tree_t *t;
 	enum zio_flag flags;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
 
 	/*
 	 * Neighbors are found through the per-type offset tree, which only
 	 * exists for reads and writes.  Without one there is nothing to
 	 * aggregate with, and the tree walks below must not be handed a
 	 * NULL tree.
 	 */
 	t = vdev_queue_type_tree(vq, zio->io_type);
 	if (t == NULL)
 		return (NULL);
 
 	first = last = zio;
 
 	if (zio->io_type == ZIO_TYPE_READ)
 		maxgap = zfs_vdev_read_gap_limit;
 
 	/*
 	 * We can aggregate I/Os that are sufficiently adjacent and of
 	 * the same flavor, as expressed by the AGG_INHERIT flags.
 	 * The latter requirement is necessary so that certain
 	 * attributes of the I/O, such as whether it's a normal I/O
 	 * or a scrub/resilver, can be preserved in the aggregate.
 	 * We can include optional I/Os, but don't allow them
 	 * to begin a range as they add no benefit in that situation.
 	 */
 
 	/*
 	 * We keep track of the last non-optional I/O.
 	 */
 	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 
 	/*
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-option I/O.
 	 */
 	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
 	    IO_GAP(dio, first) <= maxgap) {
 		first = dio;
 		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
 			mandatory = first;
 	}
 
 	/*
 	 * Skip any initial optional I/Os.
 	 */
 	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
 		first = AVL_NEXT(t, first);
 		ASSERT(first != NULL);
 	}
 
 	/*
 	 * Walk forward through sufficiently contiguous I/Os.
 	 */
 	while ((dio = AVL_NEXT(t, last)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
 	    IO_GAP(last, dio) <= maxgap) {
 		last = dio;
 		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
 			mandatory = last;
 	}
 
 	/*
 	 * Now that we've established the range of the I/O aggregation
 	 * we must decide what to do with trailing optional I/Os.
 	 * For reads, there's nothing to do. While we are unable to
 	 * aggregate further, it's possible that a trailing optional
 	 * I/O would allow the underlying device to aggregate with
 	 * subsequent I/Os. We must therefore determine if the next
 	 * non-optional I/O is close enough to make aggregation
 	 * worthwhile.
 	 */
 	stretch = B_FALSE;
 	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 		nio = last;
 		while ((dio = AVL_NEXT(t, nio)) != NULL &&
 		    IO_GAP(nio, dio) == 0 &&
 		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
 			nio = dio;
 			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
 				stretch = B_TRUE;
 				break;
 			}
 		}
 	}
 
 	if (stretch) {
 		/* This may be a no-op. */
 		dio = AVL_NEXT(t, last);
 		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 	} else {
 		/* Drop trailing optional I/Os back to the last mandatory one. */
 		while (last != mandatory && last != first) {
 			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
 			last = AVL_PREV(t, last);
 			ASSERT(last != NULL);
 		}
 	}
 
 	/* A range of one I/O is not worth aggregating. */
 	if (first == last)
 		return (NULL);
 
 	size = IO_SPAN(first, last);
 	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
 
 	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
 	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 	    vdev_queue_agg_io_done, NULL);
 	aio->io_timestamp = first->io_timestamp;
 
 	/*
 	 * Fold every I/O in [first, last] into the aggregate.  The successor
 	 * is fetched before dio is removed from the tree.
 	 */
 	nio = first;
 	do {
 		dio = nio;
 		nio = AVL_NEXT(t, dio);
 		ASSERT3U(dio->io_type, ==, aio->io_type);
 
 		if (dio->io_flags & ZIO_FLAG_NODATA) {
 			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
 			bzero((char *)aio->io_data + (dio->io_offset -
 			    aio->io_offset), dio->io_size);
 		} else if (dio->io_type == ZIO_TYPE_WRITE) {
 			bcopy(dio->io_data, (char *)aio->io_data +
 			    (dio->io_offset - aio->io_offset),
 			    dio->io_size);
 		}
 
 		zio_add_child(dio, aio);
 		vdev_queue_io_remove(vq, dio);
 		zio_vdev_io_bypass(dio);
 		zio_execute(dio);
 	} while (dio != last);
 
 	return (aio);
 }
 
 /*
  * Pick the next i/o to issue to the device, or return NULL if no class is
  * currently eligible.
  *
  * The chosen i/o (or the aggregate built around it) is removed from the
  * queued trees, added to the active tree, and returned.  The caller must
  * hold vq_lock; the lock is dropped and re-taken internally when a NODATA
  * i/o has to be discarded, after which selection starts over.
  */
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq)
 {
 	zio_t *zio, *aio;
 	zio_priority_t p;
 	avl_index_t idx;
 	avl_tree_t *tree;
 	zio_t search;
 
 again:
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	p = vdev_queue_class_to_issue(vq);
 
 	if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
 		/* No eligible queued i/os */
 		return (NULL);
 	}
 
 	/*
 	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
 	 * the most recently issued i/o in LBA (offset) order.
 	 *
 	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 	 *
 	 * Only the two key fields of the on-stack search zio are filled in.
 	 * Both comparators fall back to the zio's address, so the search key
 	 * can never compare equal to a queued zio; the VERIFY checks that.
 	 */
 	tree = vdev_queue_class_tree(vq, p);
 	search.io_timestamp = 0;
 	search.io_offset = vq->vq_last_offset + 1;
 	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
 	zio = avl_nearest(tree, idx, AVL_AFTER);
 	/* Nothing after the search key: wrap around to the start. */
 	if (zio == NULL)
 		zio = avl_first(tree);
 	ASSERT3U(zio->io_priority, ==, p);
 
 	/*
 	 * vdev_queue_aggregate() dequeues every i/o it folds into the
 	 * aggregate; if it declines, dequeue the single i/o here.
 	 */
 	aio = vdev_queue_aggregate(vq, zio);
 	if (aio != NULL)
 		zio = aio;
 	else
 		vdev_queue_io_remove(vq, zio);
 
 	/*
 	 * If the I/O is or was optional and therefore has no data, we need to
 	 * simply discard it. We need to drop the vdev queue's lock to avoid a
 	 * deadlock that we could encounter since this I/O will complete
 	 * immediately.
 	 */
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		mutex_exit(&vq->vq_lock);
 		zio_vdev_io_bypass(zio);
 		zio_execute(zio);
 		mutex_enter(&vq->vq_lock);
 		goto again;
 	}
 
 	vdev_queue_pending_add(vq, zio);
 	vq->vq_last_offset = zio->io_offset;
 
 	return (zio);
 }
 
 /*
  * Queue zio on its vdev and select the next i/o to issue.
  *
  * Returns zio itself, untouched, if it is flagged ZIO_FLAG_DONT_QUEUE.
  * Otherwise zio is timestamped and queued, and the return value is the
  * i/o the caller should issue now -- which may be a different zio -- or
  * NULL if nothing is to be issued by the caller (either no i/o is eligible,
  * or the selected i/o was an aggregate that has already been dispatched
  * here with zio_nowait()).
  */
 zio_t *
 vdev_queue_io(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 		return (zio);
 
 	/*
 	 * Children i/os inherit their parent's priority, which might
 	 * not match the child's i/o type.  Fix it up here.
 	 */
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 	} else {
 		/* The only other type expected here is a free, queued as TRIM. */
 		ASSERT(zio->io_type == ZIO_TYPE_FREE);
 		zio->io_priority = ZIO_PRIORITY_TRIM;
 	}
 
 	/* Mark the zio so that it is not queued a second time. */
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 
 	mutex_enter(&vq->vq_lock);
 	zio->io_timestamp = gethrtime();
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
 	mutex_exit(&vq->vq_lock);
 
 	if (nio == NULL)
 		return (NULL);
 
 	/* Aggregates are recognized by their completion callback. */
 	if (nio->io_done == vdev_queue_agg_io_done) {
 		zio_nowait(nio);
 		return (NULL);
 	}
 
 	return (nio);
 }
 
 /*
  * Called when zio has completed on the device: remove it from the active
  * set, record the completion time, and then issue as many further queued
  * i/os as the scheduler currently allows.
  */
 void
 vdev_queue_io_done(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
 	mutex_enter(&vq->vq_lock);
 
 	vdev_queue_pending_remove(vq, zio);
 
 	vq->vq_io_complete_ts = gethrtime();
 
 	/* The queue lock is dropped around each dispatch and re-taken after. */
 	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
 		if (nio->io_done == vdev_queue_agg_io_done) {
 			/* Aggregate i/o created by vdev_queue_aggregate(). */
 			zio_nowait(nio);
 		} else {
 			zio_vdev_io_reissue(nio);
 			zio_execute(nio);
 		}
 		mutex_enter(&vq->vq_lock);
 	}
 
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * These three functions are only used for load calculations, so we are not
  * concerned if they return an incorrect value on 32-bit platforms because
  * vq_lock is not held here; we prefer to keep them lock-free for
  * performance.
  */
 int
 vdev_queue_length(vdev_t *vd)
 {
 	/* Number of entries in the active tree, sampled without vq_lock. */
 	int active = avl_numnodes(&vd->vdev_queue.vq_active_tree);
 
 	return (active);
 }
 
 uint64_t
 vdev_queue_lastoffset(vdev_t *vd)
 {
 	/* Last offset registered for this vdev, sampled without vq_lock. */
 	uint64_t offset = vd->vdev_queue.vq_lastoffset;
 
 	return (offset);
 }
 
 void
 vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
 {
 	/* Record the end offset of zio; stored without taking vq_lock. */
 	uint64_t end = zio->io_offset + zio->io_size;
 
 	vd->vdev_queue.vq_lastoffset = end;
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	(revision 296519)
@@ -1,2406 +1,2407 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #ifdef illumos
 #include <sys/vdev_disk.h>
 #endif
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/bio.h>
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  * can be expressed with a single byte. Briefly, the operations on the
  * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can be used individually
  * or in concert to recover missing data columns.
  */
 
 /*
  * Per-column state for one RAID-Z I/O: which child the column maps to, the
  * offset and size of the I/O on that child, the buffers involved, and the
  * outcome of the attempt.
  */
 typedef struct raidz_col {
 	uint64_t rc_devidx;		/* child device index for I/O */
 	uint64_t rc_offset;		/* device offset */
 	uint64_t rc_size;		/* I/O size */
 	void *rc_data;			/* I/O data */
 	void *rc_gdata;			/* used to store the "good" version */
 	int rc_error;			/* I/O error for this device */
 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
 } raidz_col_t;
 
 /*
  * Layout of one logical I/O across the children of a RAID-Z vdev.
  *
  * Columns [0, rm_firstdatacol) of rm_col[] are parity and columns
  * [rm_firstdatacol, rm_cols) are data. rm_col[] is declared with a single
  * element, but vdev_raidz_map_alloc() sizes the allocation as
  * offsetof(raidz_map_t, rm_col[scols]), so it really holds rm_scols entries.
  */
 typedef struct raidz_map {
 	uint64_t rm_cols;		/* Regular column count */
 	uint64_t rm_scols;		/* Count including skipped columns */
 	uint64_t rm_bigcols;		/* Number of oversized columns */
 	uint64_t rm_asize;		/* Actual total I/O size */
 	uint64_t rm_missingdata;	/* Count of missing data devices */
 	uint64_t rm_missingparity;	/* Count of missing parity devices */
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
 	uint64_t rm_nskip;		/* Skipped sectors for padding */
 	uint64_t rm_skipstart;		/* Column index of padding start */
 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
 /* Indices of the P, Q and R parity columns within rm_col[]. */
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 /*
  * Multiply one field element by 2 (or by 4, as two multiplications by 2).
  * The left shift is not masked, so when x has bit 7 set the expression also
  * carries bit 8; store the result in a uint8_t (or mask it) to stay within
  * the field. Note that x is evaluated more than once.
  */
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  *
  * (mask << 1) - (mask >> 7) turns every byte whose top bit was set into
  * 0xff and leaves the others 0x00; the 0xfe... mask on the shifted value
  * discards the bit that leaked in from the neighbouring byte.
  *
  * x must be a modifiable 64-bit lvalue and mask a 64-bit scratch lvalue;
  * both are evaluated more than once. The bodies are wrapped in
  * do { } while (0) so that an invocation followed by ';' behaves as a single
  * statement, including in an unbraced if/else.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 do { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 } while (0)
 
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 do { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 } while (0)
 
 /*
  * Add VDEV_LABEL_START_SIZE to offset x. The argument is parenthesized so
  * that expressions with lower precedence than '+' expand as intended.
  */
 #define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
 
 /*
  * Force reconstruction to use the general purpose method.
  * Defined at file scope without an initializer, so it starts out as 0.
  * NOTE(review): presumably any non-zero value enables the override; the
  * code that reads it is not visible here -- confirm.
  */
 int vdev_raidz_default_to_general;
 
 /*
  * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] is 2^i.
  * The sequence has period 255, so the entry at index 255 repeats the value
  * at index 0 (0x01).
  */
 static const uint8_t vdev_raidz_pow2[256] = {
 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 };
 /*
  * Logs base 2 in the Galois field defined above: vdev_raidz_log2[v] is the
  * exponent i for which vdev_raidz_pow2[i] == v. Zero has no logarithm; its
  * slot holds 0x00 as a placeholder (the same value as the entry for 1), so
  * users must special-case 0, as vdev_raidz_exp2() does.
  */
 static const uint8_t vdev_raidz_log2[256] = {
 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 };
 
 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 
 /*
  * Field-multiply a by 2^exp, done by adding exp to the logarithm of a and
  * looking the sum back up in the power table. Zero is returned unchanged.
  */
 static uint8_t
 vdev_raidz_exp2(uint_t a, int exp)
 {
 	int e;
 
 	/* Zero has no logarithm; the product is simply zero. */
 	if (a == 0)
 		return (0);
 
 	ASSERT(exp >= 0);
 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 
 	/* Exponents above 255 are folded back by one period. */
 	e = exp + vdev_raidz_log2[a];
 	return (vdev_raidz_pow2[e > 255 ? e - 255 : e]);
 }
 
 /*
  * Release everything owned by a raidz map: the parity buffers (and their
  * "good" copies, if any), the copied-aside data buffer, and the map itself.
  */
 static void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	raidz_col_t *rc;
 	size_t datasize = 0;
 	int c;
 
 	/* Only the parity columns own their buffers. */
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
 
 		if (rc->rc_data != NULL)
 			zio_buf_free(rc->rc_data, rc->rc_size);
 
 		if (rc->rc_gdata != NULL)
 			zio_buf_free(rc->rc_gdata, rc->rc_size);
 	}
 
 	/* The data copy is one buffer sized as the sum of the data columns. */
 	if (rm->rm_datacopy != NULL) {
 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 			datasize += rm->rm_col[c].rc_size;
 
 		zio_buf_free(rm->rm_datacopy, datasize);
 	}
 
 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
 /*
  * The zio is letting go of its map. Mark the map as freed, and actually
  * free it unless checksum reports still reference it.
  */
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT0(rm->rm_freed);
 	rm->rm_freed = 1;
 
 	if (rm->rm_reports != 0)
 		return;
 
 	vdev_raidz_map_free(rm);
 }
 
 /*
  * Drop one checksum-report reference on the map; the map is freed once the
  * last reference is gone and the zio has already released it.
  */
 /*ARGSUSED*/
 static void
 vdev_raidz_cksum_free(void *arg, size_t ignored)
 {
 	raidz_map_t *rm = arg;
 
 	ASSERT3U(rm->rm_reports, >, 0);
 
 	rm->rm_reports--;
 	if (rm->rm_reports != 0 || rm->rm_freed == 0)
 		return;
 
 	vdev_raidz_map_free(rm);
 }
 
 /*
  * Finish callback for a checksum report (installed as zcr_finish by
  * vdev_raidz_cksum_report()). zcr_cbdata is the raidz map and zcr_cbinfo the
  * index of the column being reported.
  *
  * If good_data is NULL the report is finished without any data. Otherwise
  * the "good" bytes for column c are located -- regenerated parity for a
  * parity column, or the column's slice of good_data for a data column -- and
  * handed to zfs_ereport_finish_checksum() together with the bad bytes.
  */
 static void
 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 {
 	raidz_map_t *rm = zcr->zcr_cbdata;
 	size_t c = zcr->zcr_cbinfo;
 	size_t x;
 
 	const char *good = NULL;
 	/* Captured up front, before rc_data is temporarily swapped below. */
 	const char *bad = rm->rm_col[c].rc_data;
 
 	if (good_data == NULL) {
 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 		return;
 	}
 
 	if (c < rm->rm_firstdatacol) {
 		/*
 		 * The first time through, calculate the parity blocks for
 		 * the good data (this relies on the fact that the good
 		 * data never changes for a given logical ZIO)
 		 */
 		if (rm->rm_col[0].rc_gdata == NULL) {
 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 			char *buf;
 
 			/*
 			 * Set up the rm_col[]s to generate the parity for
 			 * good_data, first saving the parity bufs and
 			 * replacing them with buffers to hold the result.
 			 */
 			for (x = 0; x < rm->rm_firstdatacol; x++) {
 				bad_parity[x] = rm->rm_col[x].rc_data;
 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 				    zio_buf_alloc(rm->rm_col[x].rc_size);
 			}
 
 			/* fill in the data columns from good_data */
 			buf = (char *)good_data;
 			/* x continues from rm_firstdatacol, left by the loop above */
 			for (; x < rm->rm_cols; x++) {
 				rm->rm_col[x].rc_data = buf;
 				buf += rm->rm_col[x].rc_size;
 			}
 
 			/*
 			 * Construct the parity from the good data.
 			 */
 			vdev_raidz_generate_parity(rm);
 
 			/* restore everything back to its original state */
 			for (x = 0; x < rm->rm_firstdatacol; x++)
 				rm->rm_col[x].rc_data = bad_parity[x];
 
 			/* data columns go back to pointing into rm_datacopy */
 			buf = rm->rm_datacopy;
 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 				rm->rm_col[x].rc_data = buf;
 				buf += rm->rm_col[x].rc_size;
 			}
 		}
 
 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 		good = rm->rm_col[c].rc_gdata;
 	} else {
 		/* adjust good_data to point at the start of our column */
 		good = good_data;
 
 		for (x = rm->rm_firstdatacol; x < c; x++)
 			good += rm->rm_col[x].rc_size;
 	}
 
 	/* we drop the ereport if it ends up that the data was good */
 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 }
 
 /*
  * Invoked indirectly by zfs_ereport_start_checksum(), called
  * below when our read operation fails completely.  Its job is to preserve
  * what was read from disk so that vdev_raidz_cksum_finish() can later
  * compare it against the good data. arg carries the column index.
  */
 static void
 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	size_t c = (size_t)(uintptr_t)arg;
 	size_t size = 0;
 	caddr_t buf;
 
 	/* Wire up the report's callbacks and take a reference on the map. */
 	zcr->zcr_cbdata = rm;
 	zcr->zcr_cbinfo = c;
 	zcr->zcr_finish = vdev_raidz_cksum_finish;
 	zcr->zcr_free = vdev_raidz_cksum_free;
 
 	rm->rm_reports++;
 	ASSERT3U(rm->rm_reports, >, 0);
 
 	/* A previous report already copied the data aside. */
 	if (rm->rm_datacopy != NULL)
 		return;
 
 	/*
 	 * First report for this map: the zio's buffer may be re-used for
 	 * something else, so copy the data columns into a private buffer and
 	 * repoint the columns at it. The parity columns already live in
 	 * separate buffers and need no copy.
 	 */
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 		size += rm->rm_col[c].rc_size;
 
 	rm->rm_datacopy = zio_buf_alloc(size);
 	buf = rm->rm_datacopy;
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		raidz_col_t *col = &rm->rm_col[c];
 
 		bcopy(col->rc_data, buf, col->rc_size);
 		col->rc_data = buf;
 		buf += col->rc_size;
 	}
 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 }
 
 /*
  * Callbacks for the raidz map attached to a zio, given positionally:
  * first the routine that releases the map, then the routine that prepares a
  * checksum report.
  */
 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	vdev_raidz_map_free_vsd,
 	vdev_raidz_cksum_report
 };
 
 /*
  * Divides the IO evenly across all child vdevs; usually, dcols is
  * the number of children in the target vdev.
  *
  *	data		caller's buffer; the data columns are carved out of
  *			it consecutively (not used when dofree is set)
  *	size		size of the I/O in bytes
  *	offset		byte offset of the I/O on the RAID-Z vdev
  *	dofree		when set, no buffers are allocated or attached and
  *			every column's rc_data stays NULL
  *	unit_shift	shift converting between bytes and sectors
  *	dcols		number of columns (data + parity) to spread across
  *	nparity		number of parity columns
  *
  * Returns a newly allocated map; it is released with vdev_raidz_map_free().
  */
 static raidz_map_t *
 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
 {
 	raidz_map_t *rm;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = offset >> unit_shift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = size >> unit_shift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	q = s / (dcols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	r = s - q * (dcols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/* acols: The columns that will be accessed. */
 	/* scols: The columns that will be accessed or skipped. */
 	if (q == 0) {
 		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 
 	/* rm_col[] is sized for scols entries, not the declared single one. */
 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 
 	rm->rm_cols = acols;
 	rm->rm_scols = scols;
 	rm->rm_bigcols = bc;
 	rm->rm_skipstart = bc;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 	rm->rm_firstdatacol = nparity;
 	rm->rm_datacopy = NULL;
 	rm->rm_reports = 0;
 	rm->rm_freed = 0;
 	rm->rm_ecksuminjected = 0;
 
 	asize = 0;
 
 	for (c = 0; c < scols; c++) {
 		col = f + c;
 		coff = o;
 		/* Wrapped past the last child: continue on the next row. */
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << unit_shift;
 		}
 		rm->rm_col[c].rc_devidx = col;
 		rm->rm_col[c].rc_offset = coff;
 		rm->rm_col[c].rc_data = NULL;
 		rm->rm_col[c].rc_gdata = NULL;
 		rm->rm_col[c].rc_error = 0;
 		rm->rm_col[c].rc_tried = 0;
 		rm->rm_col[c].rc_skipped = 0;
 
 		/* Skipped columns are empty; big columns get one extra sector. */
 		if (c >= acols)
 			rm->rm_col[c].rc_size = 0;
 		else if (c < bc)
 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 		else
 			rm->rm_col[c].rc_size = q << unit_shift;
 
 		asize += rm->rm_col[c].rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << unit_shift);
 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
 	if (!dofree) {
 		/* Parity columns get freshly allocated buffers. */
 		for (c = 0; c < rm->rm_firstdatacol; c++) {
 			rm->rm_col[c].rc_data =
 			    zio_buf_alloc(rm->rm_col[c].rc_size);
 		}
 
 		/* The first data column starts at the caller's buffer... */
 		rm->rm_col[c].rc_data = data;
 
 		/* ...and each later one follows directly after its neighbour. */
 		for (c = c + 1; c < acols; c++) {
 			rm->rm_col[c].rc_data =
 			    (char *)rm->rm_col[c - 1].rc_data +
 			    rm->rm_col[c - 1].rc_size;
 		}
 	}
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rm->rm_cols >= 2);
 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 
 	/* Swap the device/offset of columns 0 and 1; buffers stay put. */
 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
 		devidx = rm->rm_col[0].rc_devidx;
 		o = rm->rm_col[0].rc_offset;
 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 		rm->rm_col[1].rc_devidx = devidx;
 		rm->rm_col[1].rc_offset = o;
 
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	return (rm);
 }
 
 /*
  * Single parity: P is the XOR of all data columns. The first data column
  * seeds P; each later column is folded in over its own length.
  */
 static void
 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 {
 	uint64_t *parity, *data, pwords, dwords, i;
 	int c;
 
 	pwords = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (data[0]);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		data = rm->rm_col[c].rc_data;
 		parity = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		dwords = rm->rm_col[c].rc_size / sizeof (data[0]);
 
 		if (c != rm->rm_firstdatacol) {
 			ASSERT(dwords <= pwords);
 			for (i = 0; i < dwords; i++)
 				parity[i] ^= data[i];
 		} else {
 			ASSERT(dwords == pwords);
 			for (i = 0; i < dwords; i++)
 				parity[i] = data[i];
 		}
 	}
 }
 
 /*
  * Double parity: compute P (plain XOR) and Q (each step multiplies the
  * running value by 2 in the field, then adds the next column) in one pass
  * over the data columns.
  */
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
 	uint64_t *p, *q, *data, pwords, dwords, mask, i;
 	int c;
 
 	pwords = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (data[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		data = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 
 		dwords = rm->rm_col[c].rc_size / sizeof (data[0]);
 
 		if (c != rm->rm_firstdatacol) {
 			ASSERT(dwords <= pwords);
 
 			/* Multiply the running Q by 2 and add in the column. */
 			for (i = 0; i < dwords; i++) {
 				p[i] ^= data[i];
 
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 				q[i] ^= data[i];
 			}
 
 			/*
 			 * Past the end of a short column the data counts as
 			 * zeros: P is unaffected, Q is still multiplied.
 			 */
 			for (; i < pwords; i++) {
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 			}
 			continue;
 		}
 
 		/* First data column: seed P and Q, zero-filling any tail. */
 		ASSERT(dwords == pwords || dwords == 0);
 		for (i = 0; i < dwords; i++) {
 			p[i] = data[i];
 			q[i] = data[i];
 		}
 		for (; i < pwords; i++) {
 			p[i] = 0;
 			q[i] = 0;
 		}
 	}
 }
 
 /*
  * Triple parity: compute P (plain XOR), Q (running value times 2) and
  * R (running value times 4) in one pass over the data columns.
  */
 static void
 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 {
 	uint64_t *p, *q, *r, *data, pwords, dwords, mask, i;
 	int c;
 
 	pwords = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (data[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		data = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 
 		dwords = rm->rm_col[c].rc_size / sizeof (data[0]);
 
 		if (c != rm->rm_firstdatacol) {
 			ASSERT(dwords <= pwords);
 
 			/* Scale the running Q and R, then add in the column. */
 			for (i = 0; i < dwords; i++) {
 				p[i] ^= data[i];
 
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 				q[i] ^= data[i];
 
 				VDEV_RAIDZ_64MUL_4(r[i], mask);
 				r[i] ^= data[i];
 			}
 
 			/*
 			 * Past the end of a short column the data counts as
 			 * zeros: P is unaffected, Q and R are still scaled.
 			 */
 			for (; i < pwords; i++) {
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 				VDEV_RAIDZ_64MUL_4(r[i], mask);
 			}
 			continue;
 		}
 
 		/* First data column: seed P, Q and R, zero-filling any tail. */
 		ASSERT(dwords == pwords || dwords == 0);
 		for (i = 0; i < dwords; i++) {
 			p[i] = data[i];
 			q[i] = data[i];
 			r[i] = data[i];
 		}
 		for (; i < pwords; i++) {
 			p[i] = 0;
 			q[i] = 0;
 			r[i] = 0;
 		}
 	}
 }
 
 /*
  * Fill in the parity columns of the map, picking the generator that matches
  * the number of parity columns (rm_firstdatacol). Any count other than
  * 1, 2 or 3 is reported through cmn_err(CE_PANIC, ...).
  */
 static void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	uint64_t nparity = rm->rm_firstdatacol;
 
 	if (nparity == 1)
 		vdev_raidz_generate_parity_p(rm);
 	else if (nparity == 2)
 		vdev_raidz_generate_parity_pq(rm);
 	else if (nparity == 3)
 		vdev_raidz_generate_parity_pqr(rm);
 	else
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 }
 
 /*
  * Rebuild the single data column tgts[0] from P: start from P and XOR in
  * every other data column. Returns a bitmask naming the parity used.
  */
 static int
 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *target, *parity, *other, xwords, owords, nwords, i;
 	int x = tgts[0];
 	int c;
 
 	ASSERT(ntgts == 1);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(x < rm->rm_cols);
 
 	xwords = rm->rm_col[x].rc_size / sizeof (target[0]);
 	ASSERT(xwords <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (target[0]));
 	ASSERT(xwords > 0);
 
 	/* Seed the target column with P. */
 	parity = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	target = rm->rm_col[x].rc_data;
 	for (i = 0; i < xwords; i++)
 		target[i] = parity[i];
 
 	/* Cancel out each surviving data column, up to the target's length. */
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		if (c == x)
 			continue;
 
 		other = rm->rm_col[c].rc_data;
 		owords = rm->rm_col[c].rc_size / sizeof (other[0]);
 		nwords = MIN(owords, xwords);
 
 		for (i = 0; i < nwords; i++)
 			target[i] ^= other[i];
 	}
 
 	return (1 << VDEV_RAIDZ_P);
 }
 
 /*
  * Rebuild the single data column tgts[0] from Q. The Q computation is
  * replayed into the target's buffer with the target itself treated as
  * zeros, the real Q is added in, and every byte is then scaled by a power
  * of 2 that depends on the target's position. Returns a bitmask naming the
  * parity used.
  */
 static int
 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *target, *from, xwords, cwords, nwords, mask, i;
 	uint8_t *bytes;
 	int x = tgts[0];
 	int c, j, exp;
 
 	ASSERT(ntgts == 1);
 
 	xwords = rm->rm_col[x].rc_size / sizeof (from[0]);
 	ASSERT(xwords <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (from[0]));
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		from = rm->rm_col[c].rc_data;
 		target = rm->rm_col[x].rc_data;
 
 		/* The missing column contributes nothing. */
 		cwords = (c == x) ? 0 :
 		    rm->rm_col[c].rc_size / sizeof (from[0]);
 		nwords = MIN(cwords, xwords);
 
 		if (c != rm->rm_firstdatacol) {
 			for (i = 0; i < nwords; i++) {
 				VDEV_RAIDZ_64MUL_2(target[i], mask);
 				target[i] ^= from[i];
 			}
 
 			for (; i < xwords; i++) {
 				VDEV_RAIDZ_64MUL_2(target[i], mask);
 			}
 		} else {
 			for (i = 0; i < nwords; i++)
 				target[i] = from[i];
 
 			for (; i < xwords; i++)
 				target[i] = 0;
 		}
 	}
 
 	from = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	target = rm->rm_col[x].rc_data;
 	exp = 255 - (rm->rm_cols - 1 - x);
 
 	for (i = 0; i < xwords; i++) {
 		target[i] ^= from[i];
 
 		/* Scale each of the word's eight bytes individually. */
 		bytes = (uint8_t *)&target[i];
 		for (j = 0; j < 8; j++)
 			bytes[j] = vdev_raidz_exp2(bytes[j], exp);
 	}
 
 	return (1 << VDEV_RAIDZ_Q);
 }
 
 /*
  * Rebuild two data columns, tgts[0] (x) and tgts[1] (y), using both P and Q.
  * Parity is recomputed into scratch buffers with x and y treated as empty,
  * and the difference from the real parity is solved for D_x and D_y a byte
  * at a time. The map's parity buffers and column sizes are modified
  * temporarily and restored before returning. Returns a bitmask naming the
  * parity used.
  */
 static int
 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 	void *pdata, *qdata;
 	uint64_t xsize, ysize, i;
 	int x = tgts[0];
 	int y = tgts[1];
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(y < rm->rm_cols);
 
 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	xsize = rm->rm_col[x].rc_size;
 	ysize = rm->rm_col[y].rc_size;
 
 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 	rm->rm_col[x].rc_size = 0;
 	rm->rm_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rm);
 
 	/* Put the real column sizes back before touching the columns. */
 	rm->rm_col[x].rc_size = xsize;
 	rm->rm_col[y].rc_size = ysize;
 
 	/* p/q walk the real parity; pxy/qxy walk the scratch parity. */
 	p = pdata;
 	q = qdata;
 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	xd = rm->rm_col[x].rc_data;
 	yd = rm->rm_col[y].rc_data;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	/* Keep A and B as exponents so the loop can use vdev_raidz_exp2(). */
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
 
 		/* Column y may be shorter than column x. */
 		if (i < ysize)
 			*yd = *p ^ *pxy ^ *xd;
 	}
 
 	/* Release the scratch parity buffers allocated above. */
 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
 
 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 }
 
 /* BEGIN CSTYLED */
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
  * computation as well as linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 /* END CSTYLED */
 
 static void
 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	int i;
 
 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
 
 	/*
 	 * Generate one coefficient row per entry of map[]. In row i, column
 	 * j holds vdev_raidz_pow2[e] where e is map[i] * (n - 1 - j) reduced
 	 * modulo 255, so the rightmost column always uses exponent 0.
 	 */
 	for (i = 0; i < nmap; i++) {
 		uint8_t *cell;
 		int remaining;
 		int pow;
 
 		ASSERT3S(0, <=, map[i]);
 		ASSERT3S(map[i], <=, 2);
 
 		/*
 		 * Start one step beyond the leftmost exponent; the walk
 		 * below steps down by map[i] before each store.
 		 */
 		pow = map[i] * n;
 		if (pow > 255)
 			pow -= 255;
 		ASSERT(pow <= 255);
 
 		cell = rows[i];
 		for (remaining = n; remaining > 0; remaining--) {
 			pow -= map[i];
 			if (pow < 0)
 				pow += 255;
 			*cell++ = vdev_raidz_pow2[pow];
 		}
 	}
 }
 
 /*
  * Compute the rows of the inverse matrix needed for reconstruction by
  * Gauss-Jordan elimination.
  *
  * rm       - RAID-Z map; only rm_firstdatacol is consulted (in assertions
  *            and to turn used[] entries into row-relative column indices).
  * n        - number of entries in each row of rows[] and invrows[].
  * nmissing - number of rows in rows[] and invrows[], and of entries in
  *            missing[].
  * missing  - for row i, the column within rows[i] that must end up holding
  *            the 1 of the identity matrix.
  * rows     - input coefficient rows; modified in place and reduced to rows
  *            of the identity matrix (verified by the final assertions).
  * invrows  - output; receives the corresponding rows of the inverse.
  * used     - the n columns used for reconstruction: nmissing parity columns
  *            followed by data columns.
  */
 static void
 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 * Each row starts out as the matching row of the identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence. For every
 	 * used data column, its coefficient is moved out of rows[] into
 	 * position j of invrows[] and zeroed in rows[], leaving non-zero
 	 * entries in rows[] only at the other columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
 			/* Convert the map column index to a data-relative one. */
 			jj = used[j] - rm->rm_firstdatacol;
 			ASSERT3S(jj, <, n);
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		/* Everything left of the pivot must already be eliminated. */
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 *
 		 * NOTE(review): this presumes vdev_raidz_exp2(a, e) multiplies
 		 * a by 2^e in the field, so that 255 - log2(pivot) scales by
 		 * the pivot's inverse -- confirm against vdev_raidz_exp2().
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		/*
 		 * Clear column missing[i] in every other row of interest by
 		 * XORing in the normalized row i scaled by that row's entry
 		 * in the column; the same operation is mirrored on invrows[].
 		 */
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 /*
  * Rebuild the missing data columns from the columns listed in used[] by
  * applying the inverse rows in invrows[].
  *
  * Every byte of every used column is combined with the matching inverse
  * coefficient through the vdev_raidz_log2[] and vdev_raidz_pow2[] tables
  * (the two logs are added and wrapped at 255), and the result is XORed into
  * the buffer of each missing column. The first used column (i == 0) stores
  * instead of XORing, so the destination buffers need no prior clearing.
  *
  * missing[] holds data-relative column indices; rm_firstdatacol is added to
  * locate the destination columns in the map. A temporary table of
  * n * nmissing coefficient logs is allocated here and freed before return.
  */
 static void
 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
 	uint8_t log = 0;
 	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	/* Carve the single allocation into nmissing rows of n entries. */
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	/* Precompute the log of every inverse coefficient. */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	/* Fold each used column, in turn, into all of the missing columns. */
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rm->rm_cols);
 
 		src = rm->rm_col[c].rc_data;
 		ccount = rm->rm_col[c].rc_size;
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rm->rm_firstdatacol;
 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
 			ASSERT3U(cc, <, rm->rm_cols);
 			ASSERT3U(cc, !=, c);
 
 			dst[j] = rm->rm_col[cc].rc_data;
 			dcount[j] = rm->rm_col[cc].rc_size;
 		}
 
 		/*
 		 * NOTE(review): missing[0] is used here as a direct index
 		 * into rm_col[], whereas the loop above adds rm_firstdatacol
 		 * to missing[j] -- confirm which column this is meant to
 		 * check.
 		 */
 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
 
 		for (x = 0; x < ccount; x++, src++) {
 			/* A zero byte has no log; it contributes zero below. */
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				/* Past the end of this destination column. */
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					/* Add the logs, wrapping at 255. */
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				/* First source column initializes the output. */
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 /*
  * Reconstruct the targeted data columns with the general matrix method.
  *
  * tgts[] lists the ntgts columns (parity and data) that must not be relied
  * upon; the parity-selection loop below walks it in step with the column
  * index, so it is expected in ascending order. For every targeted data
  * column one non-targeted parity column is chosen, lowest index first.
  *
  * Returns a bitmask with bit c set for each parity column c that was used.
  */
 static int
 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	int n, i, c, t, tt;
 	int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 
 	uint8_t *p, *pp;
 	size_t psize;
 
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	int code = 0;
 
 
 	/* n is the number of data columns in the map. */
 	n = rm->rm_cols - rm->rm_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing. The entries stored in
 	 * missing_rows[] are relative to the first data column.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rm->rm_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rm->rm_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rm->rm_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		/* Record parity column c as used, in the result and the map. */
 		code |= 1 << c;
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	ASSERT(code != 0);
 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
 
 	/*
 	 * One allocation holds, for each missing row, n bytes of rows[] and
 	 * n bytes of invrows[] (interleaved), followed by the n-entry used[]
 	 * array.
 	 */
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	/* used[] starts with the chosen parity columns ... */
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	/* ... and continues with every data column that is not missing. */
 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	return (code);
 }
 
 /*
  * Reconstruct the columns named in t[] (nt entries, ascending) together with
  * any other column that already carries an error. The specialized P, Q and
  * PQ routines are used when they apply and are not disabled by
  * vdev_raidz_default_to_general; otherwise the general matrix method is
  * used. Returns the code reported by whichever routine ran.
  */
 static int
 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 	int nbadparity, nbaddata;
 	int ntgts = 0;
 	int code;
 	int i, c;
 
 	/*
 	 * The tgts list must already be sorted.
 	 */
 	for (i = 1; i < nt; i++) {
 		ASSERT(t[i] > t[i - 1]);
 	}
 
 	/*
 	 * Assume every column is bad to begin with, then discount each one
 	 * that is neither an explicit target nor in error.
 	 */
 	nbadparity = rm->rm_firstdatacol;
 	nbaddata = rm->rm_cols - nbadparity;
 
 	i = 0;
 	for (c = 0; c < rm->rm_cols; c++) {
 		int is_parity = (c < rm->rm_firstdatacol);
 		int is_target;
 
 		if (is_parity)
 			parity_valid[c] = B_FALSE;
 
 		if (i < nt && c == t[i]) {
 			i++;
 			is_target = 1;
 		} else {
 			is_target = (rm->rm_col[c].rc_error != 0);
 		}
 
 		if (is_target) {
 			tgts[ntgts++] = c;
 		} else if (is_parity) {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		} else {
 			nbaddata--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	/* The bad data columns follow the bad parity columns in tgts[]. */
 	dt = &tgts[nbadparity];
 
 	/*
 	 * Prefer an optimized routine when the needed parity is intact.
 	 */
 	if (!vdev_raidz_default_to_general) {
 		if (nbaddata == 1) {
 			if (parity_valid[VDEV_RAIDZ_P])
 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
 
 			ASSERT(rm->rm_firstdatacol > 1);
 
 			if (parity_valid[VDEV_RAIDZ_Q])
 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
 
 			ASSERT(rm->rm_firstdatacol > 2);
 		} else if (nbaddata == 2) {
 			ASSERT(rm->rm_firstdatacol > 1);
 
 			if (parity_valid[VDEV_RAIDZ_P] &&
 			    parity_valid[VDEV_RAIDZ_Q])
 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
 
 			ASSERT(rm->rm_firstdatacol > 2);
 		}
 	}
 
 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
 	ASSERT(code > 0);
 	return (code);
 }
 
 /*
  * Open a RAID-Z vdev: open all children and derive the group's sizes and
  * ashifts from those that opened successfully. Fails with EINVAL when the
  * parity level or child count is invalid, and with the last child error
  * when more children failed to open than there are parity devices.
  */
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	uint64_t nparity = vd->vdev_nparity;
 	int numerrors = 0;
 	int lasterror = 0;
 	int c;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (child->vdev_open_error == 0) {
 			/*
 			 * Take the minimum size and the maximum ashift seen
 			 * so far. The size comparison is made on value - 1
 			 * and 1 is added back afterwards.
 			 */
 			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
 			*max_asize = MIN(*max_asize - 1,
 			    child->vdev_max_asize - 1) + 1;
 			*logical_ashift = MAX(*logical_ashift,
 			    child->vdev_ashift);
 			*physical_ashift = MAX(*physical_ashift,
 			    child->vdev_physical_ashift);
 		} else {
 			lasterror = child->vdev_open_error;
 			numerrors++;
 		}
 	}
 
 	/* Scale the per-child size up to the whole group. */
 	*asize *= vd->vdev_children;
 	*max_asize *= vd->vdev_children;
 
 	if (numerrors > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
 	return (0);
 }
 
 /*
  * Close a RAID-Z vdev by closing each of its children in index order.
  */
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	int idx = 0;
 
 	while (idx < vd->vdev_children) {
 		vdev_close(vd->vdev_child[idx]);
 		idx++;
 	}
 }
 
 #ifdef illumos
 /*
  * Handle a read or write I/O to a RAID-Z dump device.
  *
  * The dump device is in a unique situation compared to other ZFS datasets:
  * writing to this device should be as simple and fast as possible.  In
  * addition, durability matters much less since the dump will be extracted
  * once the machine reboots.  For that reason, this function eschews parity for
  * performance and simplicity.  The dump device uses the checksum setting
  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
  * dataset.
  *
  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
  * 128 KB will not fill an entire block; in addition, they may not be properly
  * aligned.  In that case, this function uses the preallocated 128 KB block and
  * omits reading or writing any "empty" portions of that block, as opposed to
  * allocating a fresh appropriately-sized block.
  *
  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
  *
  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
  *
  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
  * allocated which spans all five child vdevs.  8 KB of data would be written to
  * each of four vdevs, with the fifth containing the parity bits.
  *
  *       parity    data     data     data     data
  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
  *         ^        ^        ^        ^        ^
  *         |        |        |        |        |
  *   8 KB parity    ------8 KB data blocks------
  *
  * However, when writing to the dump device, the behavior is different:
  *
  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
  *
  * Unlike the normal RAID-Z case in which the block is allocated based on the
  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
  * I/O size is less than 128 KB, only the actual portions of data are written.
  * In this example the data is written to the third data vdev since that vdev
  * contains the offset [64 KB, 96 KB).
  *
  *       parity    data     data     data     data
  *     |        |        |        |   XX   |        |
  *                                    ^
  *                                    |
  *                             32 KB data block
  *
  * As a result, an individual I/O may not span all child vdevs; moreover, a
  * small I/O may only operate on a single child vdev.
  *
  * Note that since there are no parity bits calculated or written, this format
  * remains the same no matter how many parity bits are used in a normal RAID-Z
  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
  * would look like:
  *
  *       parity   parity   parity    data     data     data     data
  *     |        |        |        |        |        |   XX   |        |
  *                                                      ^
  *                                                      |
  *                                               32 KB data block
  */
 int
 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
 {
 	vdev_t *tvd = vd->vdev_top;
 	vdev_t *cvd;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
 	int c, err = 0;
 
 	/* [start, end) is the byte range of this request. */
 	uint64_t start, end, colstart, colend;
 	uint64_t coloffset, colsize, colskip;
 
 	int flags = doread ? BIO_READ : BIO_WRITE;
 
 	/* Outside the kernel nothing is done and err stays 0. */
 #ifdef	_KERNEL
 
 	/*
 	 * Don't write past the end of the block
 	 */
 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
 
 	start = offset;
 	end = start + size;
 
 	/*
 	 * Allocate a RAID-Z map for this block.  Note that this block starts
 	 * from the "original" offset, that is, the offset of the extent which
 	 * contains the requisite offset of the data being read or written.
 	 *
 	 * Even if this I/O operation doesn't span the full block size, let's
 	 * treat the on-disk format as if the only blocks are the complete 128
 	 * KB size.
 	 */
 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
 	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
 	    vd->vdev_children, vd->vdev_nparity);
 
 	/* coloffset is the offset at which the current data column begins. */
 	coloffset = origoffset;
 
 	/* Only data columns are visited; parity columns are skipped. */
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
 	    c++, coloffset += rc->rc_size) {
 		rc = &rm->rm_col[c];
 		cvd = vd->vdev_child[rc->rc_devidx];
 
 		/*
 		 * Find the start and end of this column in the RAID-Z map,
 		 * keeping in mind that the stated size and offset of the
 		 * operation may not fill the entire column for this vdev.
 		 *
 		 * If any portion of the data spans this column, issue the
 		 * appropriate operation to the vdev.
 		 */
 		if (coloffset + rc->rc_size <= start)
 			continue;
 		if (coloffset >= end)
 			continue;
 
 		/* Clip the request to this column. */
 		colstart = MAX(coloffset, start);
 		colend = MIN(end, coloffset + rc->rc_size);
 		colsize = colend - colstart;
 		colskip = colstart - coloffset;
 
 		VERIFY3U(colsize, <=, rc->rc_size);
 		VERIFY3U(colskip, <=, rc->rc_size);
 
 		/*
 		 * Note that the child vdev will have a vdev label at the start
 		 * of its range of offsets, hence the need for
 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
 		 * example of why this calculation is needed.
 		 */
 		if ((err = vdev_disk_physio(cvd,
 		    ((char *)rc->rc_data) + colskip, colsize,
 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
 		    flags, isdump)) != 0)
 			break;
 	}
 
 	vdev_raidz_map_free(rm);
 #endif	/* _KERNEL */
 
 	/* 0, or the first non-zero result from vdev_disk_physio(). */
 	return (err);
 }
 #endif
 
 /*
  * Convert a physical size into the size allocated on a RAID-Z vdev: the
  * data sectors, plus nparity parity sectors for each row of data sectors,
  * rounded up to a multiple of (nparity + 1) sectors.
  */
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t nparity = vd->vdev_nparity;
 	uint64_t ndata = vd->vdev_children - nparity;
 	uint64_t sectors, stripes;
 
 	/* Data sectors, rounding psize up to a whole sector. */
 	sectors = ((psize - 1) >> ashift) + 1;
 
 	/* Rows needed to hold them, each carrying nparity parity sectors. */
 	stripes = (sectors + ndata - 1) / ndata;
 	sectors += nparity * stripes;
 
 	return (roundup(sectors, nparity + 1) << ashift);
 }
 
 /*
  * Completion callback for a child I/O: record the child's result in the
  * RAID-Z column passed as io_private and mark the column as tried and not
  * skipped.
  */
 static void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *col = zio->io_private;
 
 	col->rc_skipped = 0;
 	col->rc_tried = 1;
 	col->rc_error = zio->io_error;
 }
 
 /*
  * Begin an I/O on a RAID-Z vdev by mapping it onto the child vdevs.
  *
  * Frees:
  *   Issue a child free for every column of the map.
  * Writes:
  *   1. Compute the parity over the data columns.
  *   2. Issue a child write for every column, parity and data alike.
  *   3. Issue optional, data-less writes covering any skipped (padding)
  *      sectors to improve aggregation contiguity.
  * Reads:
  *   1. Issue child reads for the data columns.
  *   2. Also read the parity columns when data is already known to be
  *      missing, or when the zio is a scrub or resilver.
  *   Columns whose child is not readable, or whose DTL reports the txg as
  *   missing, are marked with an error and skipped instead of being read.
  */
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	raidz_map_t *rm;
 	int c, i;
 
 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
 	    zio->io_type == ZIO_TYPE_FREE,
 	    tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
 
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_FREE || zio->io_type == ZIO_TYPE_WRITE) {
 		int is_write = (zio->io_type == ZIO_TYPE_WRITE);
 
 		if (is_write)
 			vdev_raidz_generate_parity(rm);
 
 		/*
 		 * Frees and writes both touch every column of the map.
 		 */
 		for (c = 0; c < rm->rm_cols; c++) {
 			raidz_col_t *col = &rm->rm_col[c];
 			vdev_t *child = vd->vdev_child[col->rc_devidx];
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, child,
 			    col->rc_offset, col->rc_data, col->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, col));
 		}
 
 		if (is_write) {
 			/*
 			 * Generate optional I/Os for any skipped sectors to
 			 * improve aggregation contiguity. The column index
 			 * wraps back to 0 after the last short column.
 			 */
 			for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip;
 			    c++, i++) {
 				raidz_col_t *col;
 				vdev_t *child;
 
 				ASSERT(c <= rm->rm_scols);
 				if (c == rm->rm_scols)
 					c = 0;
 				col = &rm->rm_col[c];
 				child = vd->vdev_child[col->rc_devidx];
 				zio_nowait(zio_vdev_child_io(zio, NULL, child,
 				    col->rc_offset + col->rc_size, NULL,
 				    1 << tvd->vdev_ashift,
 				    zio->io_type, zio->io_priority,
 				    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL,
 				    NULL, NULL));
 			}
 		}
 
 		zio_execute(zio);
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	/*
 	 * Walk the columns from last to first so that parity is considered
 	 * after all data columns -- any data column found missing on the way
 	 * forces the parity to be read as well.
 	 */
 	for (c = rm->rm_cols - 1; c >= 0; c--) {
 		raidz_col_t *col = &rm->rm_col[c];
 		vdev_t *child = vd->vdev_child[col->rc_devidx];
 		int unreadable = !vdev_readable(child);
 		int want_read;
 
 		if (unreadable ||
 		    vdev_dtl_contains(child, DTL_MISSING, zio->io_txg, 1)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
 				rm->rm_missingparity++;
 
 			if (unreadable) {
 				col->rc_error = SET_ERROR(ENXIO);
 				col->rc_tried = 1;	/* don't even try */
 			} else {
 				col->rc_error = SET_ERROR(ESTALE);
 			}
 			col->rc_skipped = 1;
 			continue;
 		}
 
 		want_read = (c >= rm->rm_firstdatacol ||
 		    rm->rm_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)));
 		if (want_read) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, child,
 			    col->rc_offset, col->rc_data, col->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, col));
 		}
 	}
 
 	zio_execute(zio);
 }
 
 
 /*
  * Charge a checksum error to the child vdev backing column rc and post the
  * matching ereport, passing bad_data along with the column's current data.
  * Nothing is recorded for speculative I/Os.
  */
 static void
 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
 	vdev_t *child = zio->io_vd->vdev_child[rc->rc_devidx];
 	zio_bad_cksum_t zbc;
 	raidz_map_t *rm;
 
 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	rm = zio->io_vsd;
 
 	mutex_enter(&child->vdev_stat_lock);
 	child->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&child->vdev_stat_lock);
 
 	zbc.zbc_has_cksum = 0;
 	zbc.zbc_injected = rm->rm_ecksuminjected;
 
 	zfs_ereport_post_checksum(zio->io_spa, child, zio,
 	    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
 	    &zbc);
 }
 
 /*
  * Verify the checksum of the zio and return the result of
  * zio_checksum_error(). When verification fails because of an injected
  * error, remember that in the RAID-Z map so that any ereports generated
  * later can note it.
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	zio_bad_cksum_t zbc;
 	int error;
 
 	error = zio_checksum_error(zio, &zbc);
 	if (error == 0)
 		return (error);
 
 	if (zbc.zbc_injected != 0)
 		rm->rm_ecksuminjected = 1;
 
 	return (error);
 }
 
 /*
  * Regenerate the parity from the data columns and compare it with the
  * parity that was read. Only parity columns that were tried and came back
  * without error are compared. Every mismatch is reported as a checksum
  * error and marks the column with ECKSUM. Returns the number of such
  * mismatches; blocks using ZIO_CHECKSUM_NOPARITY are not checked at all.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 {
 	void *orig[VDEV_RAIDZ_MAXPARITY];
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 	int nbad = 0;
 	int c;
 
 	if (bp == NULL)
 		checksum = zio->io_prop.zp_checksum;
 	else if (BP_IS_GANG(bp))
 		checksum = ZIO_CHECKSUM_GANG_HEADER;
 	else
 		checksum = BP_GET_CHECKSUM(bp);
 
 	if (checksum == ZIO_CHECKSUM_NOPARITY)
 		return (nbad);
 
 	/*
 	 * Save a copy of each parity column we are able to verify, since
 	 * regenerating the parity overwrites the column buffers.
 	 */
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		raidz_col_t *col = &rm->rm_col[c];
 
 		if (col->rc_tried && col->rc_error == 0) {
 			orig[c] = zio_buf_alloc(col->rc_size);
 			bcopy(col->rc_data, orig[c], col->rc_size);
 		}
 	}
 
 	vdev_raidz_generate_parity(rm);
 
 	/*
 	 * Compare what was read with what was just generated and release
 	 * the saved copies.
 	 */
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		raidz_col_t *col = &rm->rm_col[c];
 
 		if (!col->rc_tried || col->rc_error != 0)
 			continue;
 
 		if (bcmp(orig[c], col->rc_data, col->rc_size) != 0) {
 			raidz_checksum_error(zio, col, orig[c]);
 			col->rc_error = SET_ERROR(ECKSUM);
 			nbad++;
 		}
 		zio_buf_free(orig[c], col->rc_size);
 	}
 
 	return (nbad);
 }
 
 /*
  * Keep statistics on all the ways that we used parity to correct data.
  * The array is indexed by the reconstruction code, which is asserted to be
  * less than 1 << VDEV_RAIDZ_MAXPARITY; vdev_raidz_combrec() bumps the slot
  * for the code of each successful combinatorial reconstruction.
  */
 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 /*
  * Fold the per-column errors of a RAID-Z map into a single value using
  * zio_worst_error(), starting from 0 (no error).
  */
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
 {
 	int worst = 0;
 	int c = 0;
 
 	while (c < rm->rm_cols) {
 		worst = zio_worst_error(worst, rm->rm_col[c].rc_error);
 		c++;
 	}
 
 	return (worst);
 }
 
 /*
  * Iterate over all combinations of bad data and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * total_errors is the number of columns already known to be in error and
  * data_errors the number of those that are data columns. Up to
  * rm_firstdatacol - total_errors additional columns are targeted at a time.
  *
  * Returns the reconstruction code of the first combination whose result
  * passes the checksum, or 0 if no combination does. On success the targeted
  * columns are marked with ECKSUM; on failure their data is restored.
  */
 static int
 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 	void *orig[VDEV_RAIDZ_MAXPARITY];
 	/* One sentinel slot on each side of the real targets. */
 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 	int *tgts = &tstore[1];
 	int current, next, i, c, n;
 	int code, ret = 0;
 
 	ASSERT(total_errors < rm->rm_firstdatacol);
 
 	/*
 	 * This simplifies one edge condition.
 	 */
 	tgts[-1] = -1;
 
 	/* n is the number of columns targeted simultaneously. */
 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
 		/*
 		 * Initialize the targets array by finding the first n columns
 		 * that contain no error.
 		 *
 		 * If there were no data errors, we need to ensure that we're
 		 * always explicitly attempting to reconstruct at least one
 		 * data column. To do this, we simply push the highest target
 		 * up into the data columns.
 		 */
 		for (c = 0, i = 0; i < n; i++) {
 			if (i == n - 1 && data_errors == 0 &&
 			    c < rm->rm_firstdatacol) {
 				c = rm->rm_firstdatacol;
 			}
 
 			while (rm->rm_col[c].rc_error != 0) {
 				c++;
 				ASSERT3S(c, <, rm->rm_cols);
 			}
 
 			tgts[i] = c++;
 		}
 
 		/*
 		 * Setting tgts[n] simplifies the other edge condition.
 		 */
 		tgts[n] = rm->rm_cols;
 
 		/*
 		 * These buffers were allocated in previous iterations.
 		 */
 		for (i = 0; i < n - 1; i++) {
 			ASSERT(orig[i] != NULL);
 		}
 
 		/*
 		 * NOTE(review): every save buffer is sized from column 0,
 		 * which presumes no column is larger than column 0 --
 		 * confirm against the map allocation code.
 		 */
 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
 
 		current = 0;
 		next = tgts[current];
 
 		/* Try each combination of n error-free columns in turn. */
 		while (current != n) {
 			tgts[current] = next;
 			current = 0;
 
 			/*
 			 * Save off the original data that we're going to
 			 * attempt to reconstruct.
 			 */
 			for (i = 0; i < n; i++) {
 				ASSERT(orig[i] != NULL);
 				c = tgts[i];
 				ASSERT3S(c, >=, 0);
 				ASSERT3S(c, <, rm->rm_cols);
 				rc = &rm->rm_col[c];
 				bcopy(rc->rc_data, orig[i], rc->rc_size);
 			}
 
 			/*
 			 * Attempt a reconstruction and exit the outer loop on
 			 * success.
 			 */
 			code = vdev_raidz_reconstruct(rm, tgts, n);
 			if (raidz_checksum_verify(zio) == 0) {
 				atomic_inc_64(&raidz_corrected[code]);
 
 				/*
 				 * The targeted columns held bad data: report
 				 * those that were actually read and flag them
 				 * all with ECKSUM.
 				 */
 				for (i = 0; i < n; i++) {
 					c = tgts[i];
 					rc = &rm->rm_col[c];
 					ASSERT(rc->rc_error == 0);
 					if (rc->rc_tried)
 						raidz_checksum_error(zio, rc,
 						    orig[i]);
 					rc->rc_error = SET_ERROR(ECKSUM);
 				}
 
 				ret = code;
 				goto done;
 			}
 
 			/*
 			 * Restore the original data.
 			 */
 			for (i = 0; i < n; i++) {
 				c = tgts[i];
 				rc = &rm->rm_col[c];
 				bcopy(orig[i], rc->rc_data, rc->rc_size);
 			}
 
 			/*
 			 * Advance to the next combination: bump the lowest
 			 * target that can move, resetting every target below
 			 * it to its lowest valid position.
 			 */
 			do {
 				/*
 				 * Find the next valid column after the current
 				 * position.
 				 */
 				for (next = tgts[current] + 1;
 				    next < rm->rm_cols &&
 				    rm->rm_col[next].rc_error != 0; next++)
 					continue;
 
 				ASSERT(next <= tgts[current + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 */
 				if (next != tgts[current + 1])
 					break;
 
 				/*
 				 * Otherwise, find the next valid column after
 				 * the previous position.
 				 */
 				for (c = tgts[current - 1] + 1;
 				    rm->rm_col[c].rc_error != 0; c++)
 					continue;
 
 				tgts[current] = c;
 				current++;
 
 			} while (current != n);
 		}
 	}
 	/*
 	 * The loop ended with n one past the last value for which a buffer
 	 * was allocated; step back so that exactly the allocated buffers
 	 * are freed below.
 	 */
 	n--;
 done:
 	for (i = 0; i < n; i++) {
 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
 	}
 
 	return (ret);
 }
 
 /*
  * Complete an IO operation on a RAIDZ VDev
  *
  * Outline:
  * - For write operations:
  *   1. Check for errors on the child IOs.
  *   2. Return, setting an error code if too few child VDevs were written
  *      to reconstruct the data later.  Note that partial writes are
  *      considered successful if they can be reconstructed at all.
  * - For read operations:
  *   1. Check for errors on the child IOs.
  *   2. If data errors occurred:
  *      a. Try to reassemble the data from the parity available.
  *      b. If we haven't yet read the parity drives, read them now.
  *      c. If all parity drives have been read but the data still doesn't
  *         reassemble with a correct checksum, then try combinatorial
  *         reconstruction.
  *      d. If that doesn't work, return an error.
  *   3. If there were unexpected errors or this is a resilver operation,
  *      rewrite the vdevs that had errors.
  *
  * The function reports its outcome through zio->io_error; it has no return
  * value.  Columns [0, rm_firstdatacol) of the raidz map are treated as
  * parity and the remaining columns as data throughout.
  */
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
 	int n, c;
 	int tgts[VDEV_RAIDZ_MAXPARITY];
 	int code;
 
 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
 
 	/*
 	 * Tally the per-column results.  Errors are classified as parity or
 	 * data by column index; an error on a column that was not marked
 	 * rc_skipped counts as unexpected; an error-free parity column that
 	 * is not marked rc_tried counts as untried.
 	 */
 	for (c = 0; c < rm->rm_cols; c++) {
 		rc = &rm->rm_col[c];
 
 		if (rc->rc_error) {
 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
 
 			if (c < rm->rm_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 
 			total_errors++;
 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as a success.
 		 * (If we couldn't write enough columns to reconstruct
 		 * the data, the I/O failed.  Otherwise, good enough.)
 		 *
 		 * Now that we support write reallocation, it would be better
 		 * to treat partial failure as real failure unless there are
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
 		/* XXPOLICY */
 		if (total_errors > rm->rm_firstdatacol)
 			zio->io_error = vdev_raidz_worst_error(rm);
 
 		return;
 	} else if (zio->io_type == ZIO_TYPE_FREE) {
 		/* Frees get no verification or repair processing here. */
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 	/*
 	 * There are three potential phases for a read:
 	 *	1. produce valid data from the columns read
 	 *	2. read all disks and try again
 	 *	3. perform combinatorial reconstruction
 	 *
 	 * Each phase is progressively both more expensive and less likely to
 	 * occur. If we encounter more errors than we can repair or all phases
 	 * fail, we have no choice but to return an error.
 	 */
 
 	/*
 	 * If the number of errors we saw was correctable -- less than or equal
 	 * to the number of parity disks read -- attempt to produce data that
 	 * has a valid checksum. Naturally, this case applies in the absence of
 	 * any errors.
 	 */
 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
 		if (data_errors == 0) {
 			if (raidz_checksum_verify(zio) == 0) {
 				/*
 				 * If we read parity information (unnecessarily
 				 * as it happens since no reconstruction was
 				 * needed) regenerate and verify the parity.
 				 * We also regenerate parity when resilvering
 				 * so we can write it out to the failed device
 				 * later.
 				 */
 				if (parity_errors + parity_untried <
 				    rm->rm_firstdatacol ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
 					ASSERT(parity_errors + n <=
 					    rm->rm_firstdatacol);
 				}
 				goto done;
 			}
 		} else {
 			/*
 			 * We either attempt to read all the parity columns or
 			 * none of them. If we didn't try to read parity, we
 			 * wouldn't be here in the correctable case. There must
 			 * also have been fewer parity errors than parity
 			 * columns or, again, we wouldn't be in this code path.
 			 */
 			ASSERT(parity_untried == 0);
 			ASSERT(parity_errors < rm->rm_firstdatacol);
 
 			/*
 			 * Identify the data columns that reported an error.
 			 */
 			n = 0;
 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
 				if (rc->rc_error != 0) {
 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 					tgts[n++] = c;
 				}
 			}
 
 			ASSERT(rm->rm_firstdatacol >= n);
 
 			code = vdev_raidz_reconstruct(rm, tgts, n);
 
 			if (raidz_checksum_verify(zio) == 0) {
 				atomic_inc_64(&raidz_corrected[code]);
 
 				/*
 				 * If we read more parity disks than were used
 				 * for reconstruction, confirm that the other
 				 * parity disks produced correct data. This
 				 * routine is suboptimal in that it regenerates
 				 * the parity that we already used in addition
 				 * to the parity that we're attempting to
 				 * verify, but this should be a relatively
 				 * uncommon case, and can be optimized if it
 				 * becomes a problem. Note that we regenerate
 				 * parity when resilvering so we can write it
 				 * out to failed devices later.
 				 */
 				if (parity_errors < rm->rm_firstdatacol - n ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
 					ASSERT(parity_errors + n <=
 					    rm->rm_firstdatacol);
 				}
 
 				goto done;
 			}
 		}
 	}
 
 	/*
 	 * This isn't a typical situation -- either we got a read error or
 	 * a child silently returned bad data. Read every block so we can
 	 * try again with as much data and parity as we can track down. If
 	 * we've already been through once before, all children will be marked
 	 * as tried so we'll proceed to combinatorial reconstruction.
 	 */
 	/*
 	 * Anything that reaches this point is treated as unexpected, so the
 	 * repair pass at "done:" runs if a later phase in this call succeeds.
 	 */
 	unexpected_errors = 1;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 
 	for (c = 0; c < rm->rm_cols; c++) {
 		if (rm->rm_col[c].rc_tried)
 			continue;
 
 		/*
 		 * c is the first column not yet tried.  Issue child reads for
 		 * it and for every later untried column, then return without
 		 * completing the zio.
 		 * NOTE(review): zio_vdev_io_redone() presumably arranges for
 		 * this function to run again once the children finish --
 		 * confirm against the zio pipeline.
 		 */
 		zio_vdev_io_redone(zio);
 		do {
 			rc = &rm->rm_col[c];
 			if (rc->rc_tried)
 				continue;
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    vd->vdev_child[rc->rc_devidx],
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		} while (++c < rm->rm_cols);
 
 		return;
 	}
 
 	/*
 	 * At this point we've attempted to reconstruct the data given the
 	 * errors we detected, and we've attempted to read all columns. There
 	 * must, therefore, be one or more additional problems -- silent errors
 	 * resulting in invalid data rather than explicit I/O errors resulting
 	 * in absent data. We check if there is enough additional data to
 	 * possibly reconstruct the data and then perform combinatorial
 	 * reconstruction over all possible combinations. If that fails,
 	 * we're cooked.
 	 */
 	if (total_errors > rm->rm_firstdatacol) {
 		zio->io_error = vdev_raidz_worst_error(rm);
 
 	} else if (total_errors < rm->rm_firstdatacol &&
 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
 		/*
 		 * If we didn't use all the available parity for the
 		 * combinatorial reconstruction, verify that the remaining
 		 * parity is correct.
 		 */
 		if (code != (1 << rm->rm_firstdatacol) - 1)
 			(void) raidz_parity_verify(zio, rm);
 	} else {
 		/*
 		 * We're here because either:
 		 *
 		 *	total_errors == rm_first_datacol, or
 		 *	vdev_raidz_combrec() failed
 		 *
 		 * In either case, there is enough bad data to prevent
 		 * reconstruction.
 		 *
 		 * Start checksum ereports for all children which haven't
 		 * failed, and the IO wasn't speculative.
 		 */
 		zio->io_error = SET_ERROR(ECKSUM);
 
 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			for (c = 0; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
 				if (rc->rc_error == 0) {
 					zio_bad_cksum_t zbc;
 					zbc.zbc_has_cksum = 0;
 					zbc.zbc_injected =
 					    rm->rm_ecksuminjected;
 
 					/*
 					 * The column index is passed as the
 					 * opaque argument, cast through
 					 * uintptr_t.
 					 */
 					zfs_ereport_start_checksum(
 					    zio->io_spa,
 					    vd->vdev_child[rc->rc_devidx],
 					    zio, rc->rc_offset, rc->rc_size,
 					    (void *)(uintptr_t)c, &zbc);
 				}
 			}
 		}
 	}
 
 done:
 	zio_checksum_verified(zio);
 
 	/*
 	 * Repair only when the read ultimately succeeded, the pool is
 	 * writeable, and either something unexpected happened or this is a
 	 * resilver.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 
 			/* Only columns that recorded an error are rewritten. */
 			if (rc->rc_error == 0)
 				continue;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
 
 /*
  * Recompute the state of a RAID-Z vdev from the number of faulted and
  * degraded children.
  */
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	/* More faulted children than parity devices: cannot be opened. */
 	if (faulted > vd->vdev_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	/* No child is faulted or degraded. */
 	if (degraded + faulted == 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 		return;
 	}
 
 	/* At least one child is impaired, but parity still covers it. */
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 }
 
 /*
  * Operations table for the RAID-Z vdev type.  Entries are positional, so
  * their order must match the member order of vdev_ops_t.
  */
 vdev_ops_t vdev_raidz_ops = {
 	vdev_raidz_open,
 	vdev_raidz_close,
 	vdev_raidz_asize,
 	vdev_raidz_io_start,
 	vdev_raidz_io_done,
 	vdev_raidz_state_change,
 	/*
 	 * NOTE(review): the next two slots are left NULL; which operations
 	 * they correspond to is not visible here -- confirm against the
 	 * vdev_ops_t declaration.
 	 */
 	NULL,
 	NULL,
 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	(revision 296519)
@@ -1,1469 +1,1470 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/zap.h>
 #include <sys/refcount.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
 #include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
 /*
  * NOTE(review): zap_m_phys() is presumably defined "inline" in a header;
  * this "extern inline" declaration would then make this file provide its
  * external definition -- confirm against the header.
  */
 extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
 
 /* Forward declaration: zap_lockdir() calls this before its definition. */
 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
 
 /*
  * Return the ZAP's flags word.  A microzap has no flags, so 0 is returned
  * for it; otherwise the value comes from the fat-zap header.
  */
 uint64_t
 zap_getflags(zap_t *zap)
 {
 	if (!zap->zap_ismicro)
 		return (zap_f_phys(zap)->zap_flags);
 
 	return (0);
 }
 
 /*
  * Number of high-order hash bits in use: 48 when ZAP_FLAG_HASH64 is set,
  * 28 otherwise.
  */
 int
 zap_hashbits(zap_t *zap)
 {
 	return ((zap_getflags(zap) & ZAP_FLAG_HASH64) ? 48 : 28);
 }
 
 /*
  * Largest collision differentiator value: 16 bits' worth when
  * ZAP_FLAG_HASH64 is set, the full 32-bit range otherwise.
  */
 uint32_t
 zap_maxcd(zap_t *zap)
 {
 	uint32_t maxcd = -1U;
 
 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
 		maxcd = (1<<16)-1;
 
 	return (maxcd);
 }
 
 /*
  * Compute the hash of a ZAP name.
  *
  * With ZAP_FLAG_PRE_HASHED_KEY the first 64-bit word of the original key is
  * used directly.  Otherwise the normalized key is run through the CRC-64
  * table, seeded with the ZAP's salt, either a byte of each 64-bit word at a
  * time (ZAP_FLAG_UINT64_KEY) or a character at a time.  In all cases only
  * the top zap_hashbits() bits of the result are kept.
  */
 static uint64_t
 zap_hash(zap_name_t *zn)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t h = 0;
 
 	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
 		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
 		h = *(uint64_t *)zn->zn_key_orig;
 	} else {
 		h = zap->zap_salt;
 		ASSERT(h != 0);
 		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
 			const uint64_t *words = zn->zn_key_norm;
 			int w;
 
 			ASSERT(zn->zn_key_intlen == 8);
 			for (w = 0; w < zn->zn_key_norm_numints; w++) {
 				uint64_t bits = words[w];
 				int b;
 
 				/* Fold in one byte per step, low byte first. */
 				for (b = 0; b < zn->zn_key_intlen; b++) {
 					h = (h >> 8) ^
 					    zfs_crc64_table[(h ^ bits) & 0xFF];
 					bits >>= NBBY;
 				}
 			}
 		} else {
 			const uint8_t *bytes = zn->zn_key_norm;
 			int k, nbytes;
 
 			/*
 			 * The terminating null was historically stored on
 			 * disk but never hashed, and that must stay true.
 			 * (For non-binary keys zn_key_*_numints counts the
 			 * terminating null, hence the "- 1".)
 			 */
 			nbytes = zn->zn_key_norm_numints - 1;
 
 			ASSERT(zn->zn_key_intlen == 1);
 			for (k = 0; k < nbytes; k++) {
 				h = (h >> 8) ^
 				    zfs_crc64_table[(h ^ bytes[k]) & 0xFF];
 			}
 		}
 	}
 	/*
 	 * Don't use all 64 bits, since we need some in the cookie for
 	 * the collision differentiator.  We MUST use the high bits,
 	 * since those are the ones that we first pay attention to when
 	 * choosing the bucket.
 	 */
 	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 
 	return (h);
 }
 
 /*
  * Normalize "name" into "namenorm" using the ZAP's normalization flags.
  * The output buffer is taken to be ZAP_MAXNAMELEN bytes.  Returns the error
  * code reported by u8_textprep_str() through its error argument (0 when
  * none was reported).
  */
 static int
 zap_normalize(zap_t *zap, const char *name, char *namenorm)
 {
 	size_t srclen, dstlen;
 	int error = 0;
 
 	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
 
 	/* The terminating null is included in the input length. */
 	srclen = strlen(name) + 1;
 	dstlen = ZAP_MAXNAMELEN;
 
 	(void) u8_textprep_str((char *)name, &srclen, namenorm, &dstlen,
 	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
 	    U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &error);
 
 	return (error);
 }
 
 /*
  * Decide whether "matchname" matches the name described by zn.  For
  * MT_FIRST the comparison is made on normalized forms; for any other match
  * type the original key is compared byte for byte.
  */
 boolean_t
 zap_match(zap_name_t *zn, const char *matchname)
 {
 	char norm[ZAP_MAXNAMELEN];
 
 	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
 
 	if (zn->zn_matchtype != MT_FIRST) {
 		/* MT_BEST or MT_EXACT */
 		return (strcmp(zn->zn_key_orig, matchname) == 0);
 	}
 
 	/* A name that cannot be normalized never matches. */
 	if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
 		return (B_FALSE);
 
 	return (strcmp(zn->zn_key_norm, norm) == 0);
 }
 
 /*
  * Release a zap_name_t obtained from zap_name_alloc() or
  * zap_name_alloc_uint64().
  */
 void
 zap_name_free(zap_name_t *zn)
 {
 	kmem_free(zn, sizeof (*zn));
 }
 
 /*
  * Allocate and fill in a zap_name_t for a string key.
  *
  * When the ZAP has normalization flags the key is normalized into the
  * embedded buffer; otherwise the normalized key aliases the original and
  * only MT_EXACT is accepted.  Returns NULL if normalization fails or the
  * match type is not supported; the caller frees a non-NULL result with
  * zap_name_free().
  */
 zap_name_t *
 zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
 {
 	zap_name_t *zn = kmem_alloc(sizeof (*zn), KM_SLEEP);
 
 	zn->zn_zap = zap;
 	zn->zn_matchtype = mt;
 	zn->zn_key_intlen = sizeof (*key);
 	zn->zn_key_orig = key;
 	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
 
 	if (zap->zap_normflags == 0) {
 		/* Without normalization only exact matching makes sense. */
 		if (mt != MT_EXACT)
 			goto fail;
 		zn->zn_key_norm = zn->zn_key_orig;
 		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
 	} else {
 		if (zap_normalize(zap, key, zn->zn_normbuf) != 0)
 			goto fail;
 		zn->zn_key_norm = zn->zn_normbuf;
 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
 	}
 
 	zn->zn_hash = zap_hash(zn);
 	return (zn);
 
 fail:
 	zap_name_free(zn);
 	return (NULL);
 }
 
 /*
  * Allocate and fill in a zap_name_t for a binary key made of "numints"
  * 64-bit words.  No normalization applies, so the normalized key is the
  * original key and the match type is always MT_EXACT.
  */
 zap_name_t *
 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
 {
 	zap_name_t *zn = kmem_alloc(sizeof (*zn), KM_SLEEP);
 
 	ASSERT(zap->zap_normflags == 0);
 
 	zn->zn_zap = zap;
 	zn->zn_matchtype = MT_EXACT;
 	zn->zn_key_intlen = sizeof (*key);
 
 	zn->zn_key_norm = key;
 	zn->zn_key_orig = key;
 	zn->zn_key_norm_numints = numints;
 	zn->zn_key_orig_numints = numints;
 
 	zn->zn_hash = zap_hash(zn);
 	return (zn);
 }
 
 /*
  * Byte-swap a microzap block in place: the three header words, then the
  * value and collision differentiator of every chunk.  The chunk count is
  * derived from the block size, less one entry-sized slot.
  */
 static void
 mzap_byteswap(mzap_phys_t *buf, size_t size)
 {
 	int nchunks = (size / MZAP_ENT_LEN) - 1;
 	int c;
 
 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
 	buf->mz_salt = BSWAP_64(buf->mz_salt);
 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
 
 	for (c = 0; c < nchunks; c++) {
 		buf->mz_chunk[c].mze_value =
 		    BSWAP_64(buf->mz_chunk[c].mze_value);
 		buf->mz_chunk[c].mze_cd =
 		    BSWAP_32(buf->mz_chunk[c].mze_cd);
 	}
 }
 
 /*
  * Byte-swap a ZAP block in place.  The first 64-bit word holds the block
  * type; if it is ZBT_MICRO in either byte order the block is a microzap,
  * otherwise it is handed to the fat-zap byteswap routine.
  */
 void
 zap_byteswap(void *buf, size_t size)
 {
 	uint64_t block_type = *(uint64_t *)buf;
 
 	if (block_type != ZBT_MICRO && block_type != BSWAP_64(ZBT_MICRO)) {
 		fzap_byteswap(buf, size);
 		return;
 	}
 
 	mzap_byteswap(buf, size);
 }
 
 /*
  * AVL comparator for in-core microzap entries: order by hash first and by
  * collision differentiator second.  Returns -1, 0 or +1.
  */
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
 	const mzap_ent_t *lhs = arg1;
 	const mzap_ent_t *rhs = arg2;
 
 	if (lhs->mze_hash != rhs->mze_hash)
 		return (lhs->mze_hash > rhs->mze_hash ? +1 : -1);
 
 	if (lhs->mze_cd != rhs->mze_cd)
 		return (lhs->mze_cd > rhs->mze_cd ? +1 : -1);
 
 	return (0);
 }
 
 /*
  * Add an in-core entry for microzap chunk "chunkid" with the given hash to
  * the ZAP's AVL tree.  The collision differentiator is read from the
  * on-disk chunk.  Returns 0 on success, or EEXIST (leaving the tree
  * untouched) if an entry with the same hash and cd is already present.
  * The caller must hold the ZAP's rwlock as writer.
  */
 static int
 mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 {
 	mzap_ent_t *mze;
 	avl_index_t where;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	mze = kmem_alloc(sizeof (*mze), KM_SLEEP);
 	mze->mze_chunkid = chunkid;
 	mze->mze_hash = hash;
 	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
 	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
 
 	if (avl_find(&zap->zap_m.zap_avl, mze, &where) == NULL) {
 		avl_insert(&zap->zap_m.zap_avl, mze, where);
 		return (0);
 	}
 
 	/* Duplicate (hash, cd) pair: discard the new node. */
 	kmem_free(mze, sizeof (*mze));
 	return (EEXIST);
 }
 
 /*
  * Find the in-core microzap entry matching zn, or NULL if there is none.
  *
  * All entries sharing zn's hash are scanned in cd order.  If nothing
  * matches and the match type is MT_BEST, the match type is changed to
  * MT_FIRST (zn is modified) and the scan is repeated.
  * The caller must hold the ZAP's rwlock.
  */
 static mzap_ent_t *
 mze_find(zap_name_t *zn)
 {
 	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 	avl_index_t where;
 
 	ASSERT(zn->zn_zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
 	mze_tofind.mze_hash = zn->zn_hash;
 	mze_tofind.mze_cd = 0;
 
 	for (;;) {
 		/* Start at cd 0, or at the first entry after that slot. */
 		mze = avl_find(avl, &mze_tofind, &where);
 		if (mze == NULL)
 			mze = avl_nearest(avl, where, AVL_AFTER);
 
 		while (mze != NULL && mze->mze_hash == zn->zn_hash) {
 			ASSERT3U(mze->mze_cd, ==,
 			    MZE_PHYS(zn->zn_zap, mze)->mze_cd);
 			if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 				return (mze);
 			mze = AVL_NEXT(avl, mze);
 		}
 
 		if (zn->zn_matchtype != MT_BEST)
 			return (NULL);
 
 		/* Exact pass failed; retry with normalized matching. */
 		zn->zn_matchtype = MT_FIRST;
 	}
 }
 
 /*
  * Return the smallest collision differentiator not in use among the
  * in-core entries with the given hash.  The walk starts at cd 0 and stops
  * at the first gap in the sequence.
  * The caller must hold the ZAP's rwlock.
  */
 static uint32_t
 mze_find_unused_cd(zap_t *zap, uint64_t hash)
 {
 	avl_tree_t *avl = &zap->zap_m.zap_avl;
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 	avl_index_t where;
 	uint32_t cd = 0;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	mze_tofind.mze_hash = hash;
 	mze_tofind.mze_cd = 0;
 
 	mze = avl_find(avl, &mze_tofind, &where);
 	while (mze != NULL && mze->mze_hash == hash && mze->mze_cd == cd) {
 		cd++;
 		mze = AVL_NEXT(avl, mze);
 	}
 
 	return (cd);
 }
 
 /*
  * Unlink an in-core microzap entry from the ZAP's AVL tree and free it.
  * The caller must hold the ZAP's rwlock as writer.
  */
 static void
 mze_remove(zap_t *zap, mzap_ent_t *mze)
 {
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	avl_remove(&zap->zap_m.zap_avl, mze);
 	kmem_free(mze, sizeof (*mze));
 }
 
 /*
  * Free every in-core microzap entry and destroy the ZAP's AVL tree.
  */
 static void
 mze_destroy(zap_t *zap)
 {
 	void *cookie = NULL;
 	mzap_ent_t *mze;
 
 	for (;;) {
 		mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &cookie);
 		if (mze == NULL)
 			break;
 		kmem_free(mze, sizeof (*mze));
 	}
 
 	avl_destroy(&zap->zap_m.zap_avl);
 }
 
 /*
  * Build the in-core zap_t for ZAP object "obj", whose first block is "db",
  * and register it as the dbuf's user.
  *
  * If dmu_buf_set_user() reports that a user is already attached (the
  * "winner"), the zap_t built here is torn down and the winner is returned
  * instead.  Otherwise, for a microzap, the AVL tree of entries is populated
  * from the on-disk chunks; for a fat zap, salt and normalization flags are
  * copied from the header.  The returned zap_t is not locked: the rwlock
  * taken here is dropped before returning.
  */
 static zap_t *
 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 {
 	zap_t *winner;
 	zap_t *zap;
 	int i;
 
 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
 
 	/*
 	 * Hold the lock as writer while the structure is initialized, since
 	 * it becomes visible to others at dmu_buf_set_user() below.
 	 */
 	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 	rw_init(&zap->zap_rwlock, 0, 0, 0);
 	rw_enter(&zap->zap_rwlock, RW_WRITER);
 	zap->zap_objset = os;
 	zap->zap_object = obj;
 	zap->zap_dbuf = db;
 
 	/* The first 64-bit word of the block says micro vs. fat. */
 	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
 		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
 	} else {
 		zap->zap_ismicro = TRUE;
 	}
 
 	/*
 	 * Make sure that zap_ismicro is set before we let others see
 	 * it, because zap_lockdir() checks zap_ismicro without the lock
 	 * held.
 	 */
 	dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
 	winner = dmu_buf_set_user(db, &zap->zap_dbu);
 
 	if (winner != NULL) {
 		/* Someone else's zap_t is attached; undo ours and use it. */
 		rw_exit(&zap->zap_rwlock);
 		rw_destroy(&zap->zap_rwlock);
 		if (!zap->zap_ismicro)
 			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 		kmem_free(zap, sizeof (zap_t));
 		return (winner);
 	}
 
 	if (zap->zap_ismicro) {
 		zap->zap_salt = zap_m_phys(zap)->mz_salt;
 		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 		avl_create(&zap->zap_m.zap_avl, mze_compare,
 		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 
 		/* A chunk whose name starts with a null byte is unused. */
 		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 			mzap_ent_phys_t *mze =
 			    &zap_m_phys(zap)->mz_chunk[i];
 			if (mze->mze_name[0]) {
 				zap_name_t *zn;
 
 				/*
 				 * NOTE(review): zap_name_alloc() returns NULL
 				 * when normalization of the name fails; the
 				 * result is dereferenced here without a check.
 				 */
 				zn = zap_name_alloc(zap, mze->mze_name,
 				    MT_EXACT);
 				/*
 				 * A duplicate (hash, cd) pair is reported and
 				 * left out of the entry count.
 				 */
 				if (mze_insert(zap, i, zn->zn_hash) == 0)
 					zap->zap_m.zap_num_entries++;
 				else {
 					printf("ZFS WARNING: Duplicated ZAP "
 					    "entry detected (%s).\n",
 					    mze->mze_name);
 				}
 				zap_name_free(zn);
 			}
 		}
 	} else {
 		zap->zap_salt = zap_f_phys(zap)->zap_salt;
 		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
 		    2*ZAP_LEAF_CHUNKSIZE);
 
 		/*
 		 * The embedded pointer table should not overlap the
 		 * other members.
 		 */
 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
 		    &zap_f_phys(zap)->zap_salt);
 
 		/*
 		 * The embedded pointer table should end at the end of
 		 * the block
 		 */
 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
 		    (uintptr_t)zap_f_phys(zap), ==,
 		    zap->zap_dbuf->db_size);
 	}
 	rw_exit(&zap->zap_rwlock);
 	return (zap);
 }
 
 /*
  * Hold and lock the ZAP object "obj".
  *
  * On success 0 is returned and *zapp points at the zap_t, with its rwlock
  * held (as "lti", or as reader when the ZAP is fat and "fatreader" is set)
  * and a hold on its dbuf; the caller releases both with zap_unlockdir().
  * When "adding" to a full microzap with a transaction supplied, the object
  * is first grown by SPA_MINBLOCKSIZE or, once it would exceed
  * MZAP_MAX_BLKSZ, upgraded to a fat zap.
  *
  * On failure an error is returned, *zapp is NULL, and no lock or hold is
  * left behind.
  */
 int
 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	zap_t *zap;
 	dmu_buf_t *db;
 	krw_t lt;
 	int err;
 
 	*zapp = NULL;
 
 	err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
 		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
 	zap = dmu_buf_get_user(db);
 	if (zap == NULL)
 		zap = mzap_open(os, obj, db);
 
 	/*
 	 * We're checking zap_ismicro without the lock held, in order to
 	 * tell what type of lock we want.  Once we have some sort of
 	 * lock, see if it really is the right type.  In practice this
 	 * can only be different if it was upgraded from micro to fat,
 	 * and micro wanted WRITER but fat only needs READER.
 	 */
 	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 	rw_enter(&zap->zap_rwlock, lt);
 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 		/* it was upgraded, now we only need reader */
 		ASSERT(lt == RW_WRITER);
 		/*
 		 * The conditional must be parenthesized: "==" binds tighter
 		 * than "?:", so without the parentheses the comparison would
 		 * be made against the condition, not against its result.
 		 */
 		ASSERT(RW_READER ==
 		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
 		rw_downgrade(&zap->zap_rwlock);
 		lt = RW_READER;
 	}
 
 	zap->zap_objset = os;
 
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 
 	ASSERT3P(zap->zap_dbuf, ==, db);
 
 	ASSERT(!zap->zap_ismicro ||
 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
 	if (zap->zap_ismicro && tx && adding &&
 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
 		if (newsz > MZAP_MAX_BLKSZ) {
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
 			err = mzap_upgrade(zapp, tx, 0);
 			/*
 			 * Callers do not unlock after a failure, so drop
 			 * the lock and hold here.  mzap_upgrade() leaves
 			 * *zapp pointing at whichever zap_t is still
 			 * locked, or NULL if nothing is held any more.
 			 */
 			if (err != 0 && *zapp != NULL) {
 				zap_unlockdir(*zapp);
 				*zapp = NULL;
 			}
 			return (err);
 		}
 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
 		ASSERT0(err);
 		zap->zap_m.zap_num_chunks =
 		    db->db_size / MZAP_ENT_LEN - 1;
 	}
 
 	*zapp = zap;
 	return (0);
 }
 
 /*
  * Undo zap_lockdir(): drop the ZAP's rwlock, then release the hold on its
  * dbuf.  The NULL tag matches the one zap_lockdir() passes to
  * dmu_buf_hold().
  */
 void
 zap_unlockdir(zap_t *zap)
 {
 	rw_exit(&zap->zap_rwlock);
 	dmu_buf_rele(zap->zap_dbuf, NULL);
 }
 
 /*
  * Convert the microzap *zapp into a fat zap.
  *
  * The microzap block is copied aside; when "flags" is 0 the object's block
  * size is first set to the default fat-zap block size; the in-core AVL tree
  * is destroyed; fzap_upgrade() is called; and every saved entry is then
  * re-added with its original collision differentiator.
  *
  * The caller must hold the ZAP's rwlock as writer.  On return *zapp is
  * updated, because fzap_add_cd() may change which zap_t is in use (the
  * early error return leaves *zapp untouched).  Returns 0, or the first
  * error from dmu_object_set_blocksize() or fzap_add_cd().
  */
 static int
 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 {
 	mzap_phys_t *mzp;
 	int i, sz, nchunks;
 	int err = 0;
 	zap_t *zap = *zapp;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/* Snapshot the microzap contents before the block is rewritten. */
 	sz = zap->zap_dbuf->db_size;
 	mzp = zio_buf_alloc(sz);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
 	if (!flags) {
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err) {
 			zio_buf_free(mzp, sz);
 			return (err);
 		}
 	}
 
 	dprintf("upgrading obj=%llu with %u chunks\n",
 	    zap->zap_object, nchunks);
 	/* XXX destroy the avl later, so we can use the stored hash value */
 	mze_destroy(zap);
 
 	fzap_upgrade(zap, tx, flags);
 
 	/* Replay the saved entries into the new fat zap. */
 	for (i = 0; i < nchunks; i++) {
 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
 		zap_name_t *zn;
 		/* Skip unused chunks (empty name). */
 		if (mze->mze_name[0] == 0)
 			continue;
 		dprintf("adding %s=%llu\n",
 		    mze->mze_name, mze->mze_value);
 		/*
 		 * NOTE(review): zap_name_alloc() can return NULL if the name
 		 * fails normalization; zn is used below without a check.
 		 */
 		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
 		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 		zap_name_free(zn);
 		if (err)
 			break;
 	}
 	zio_buf_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }
 
 /*
  * Initialize object "obj" as an empty microzap: set the block type, a
  * non-zero salt and the normalization flags in its first block.  If "flags"
  * is non-zero the new ZAP is immediately upgraded to a fat zap, since only
  * a fat zap supports flags.  Failures are fatal (VERIFY).
  */
 void
 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
     dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	mzap_phys_t *mzp;
 	zap_t *zap;
 
 	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
 		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
 	dmu_buf_will_dirty(db, tx);
 	mzp = db->db_data;
 	mzp->mz_block_type = ZBT_MICRO;
 	/* OR-ing in 1 guarantees the salt is never zero. */
 	mzp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
 	mzp->mz_normflags = normflags;
 	dmu_buf_rele(db, FTAG);
 
 	if (flags == 0)
 		return;
 
 	/* Only fat zap supports flags; upgrade immediately. */
 	VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
 	    B_FALSE, B_FALSE, &zap));
 	VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
 	zap_unlockdir(zap);
 }
 
 /*
  * Claim object number "obj" and make it a ZAP with no normalization.
  * Convenience wrapper around zap_create_claim_norm().
  */
 int
 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int err;
 
 	err = zap_create_claim_norm(os, obj, 0, ot, bonustype, bonuslen, tx);
 	return (err);
 }
 
 /*
  * Claim object number "obj" and initialize it as a microzap with the given
  * normalization flags.  Returns the error from dmu_object_claim(), or 0.
  */
 int
 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
     dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
 
 	if (err == 0)
 		mzap_create_impl(os, obj, normflags, 0, tx);
 
 	return (err);
 }
 
 /*
  * Allocate a new ZAP object with no normalization and return its object
  * number.  Convenience wrapper around zap_create_norm().
  */
 uint64_t
 zap_create(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = zap_create_norm(os, 0, ot, bonustype, bonuslen, tx);
 	return (obj);
 }
 
 /*
  * Allocate a new object, initialize it as a microzap with the given
  * normalization flags, and return its object number.
  */
 uint64_t
 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 	mzap_create_impl(os, obj, normflags, 0, tx);
 
 	return (obj);
 }
 
 /*
  * Allocate a new ZAP object with explicit leaf and indirect block shifts,
  * normalization flags and ZAP flags, and return its object number.  Both
  * shifts must lie between SPA_MINBLOCKSHIFT and SPA_OLD_MAXBLOCKSHIFT.
  * A failure to set the block size is fatal (VERIFY).
  */
 uint64_t
 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
 	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
 	    leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
 	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
 	    indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
 	VERIFY(dmu_object_set_blocksize(os, obj,
 	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
 
 	mzap_create_impl(os, obj, normflags, flags, tx);
 
 	return (obj);
 }
 
 /*
  * Destroy a ZAP object.  Returns the result of dmu_object_free().
  */
 int
 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 {
 	int err;
 
 	/*
 	 * dmu_object_free will free the object number and free the
 	 * data.  Freeing the data will cause our pageout function to be
 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
 	 */
 	err = dmu_object_free(os, zapobj, tx);
 
 	return (err);
 }
 
 /*
  * Eviction callback registered through dmu_buf_init_user(): tear down the
  * zap_t behind "dbu".  The rwlock is destroyed first, then the type-specific
  * state (AVL tree for a microzap, entry-count mutex for a fat zap), and
  * finally the zap_t itself is freed.
  */
 void
 zap_evict(void *dbu)
 {
 	zap_t *zap = dbu;
 
 	rw_destroy(&zap->zap_rwlock);
 
 	if (!zap->zap_ismicro) {
 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 	} else {
 		mze_destroy(zap);
 	}
 
 	kmem_free(zap, sizeof (*zap));
 }
 
 /*
  * Store the number of entries in the ZAP object into *count.  Returns 0 on
  * success, or the error from zap_lockdir() or fzap_count().
  */
 int
 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 {
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	if (zap->zap_ismicro) {
 		/* The microzap keeps its entry count in core. */
 		*count = zap->zap_m.zap_num_entries;
 	} else {
 		err = fzap_count(zap, count);
 	}
 
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Report whether some other microzap entry with the same hash as "mze"
  * also matches its name under normalization.  Always B_FALSE when the ZAP
  * has no normalization flags.
  *
  * zn may be NULL; if not specified, it will be computed if needed.
  * See also the comment above zap_entry_normalization_conflict().
  */
 static boolean_t
 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
 {
 	int directions[2] = { AVL_BEFORE, AVL_AFTER };
 	boolean_t allocdzn = B_FALSE;
 	boolean_t conflict = B_FALSE;
 	mzap_ent_t *other;
 	int d;
 
 	if (zap->zap_normflags == 0)
 		return (B_FALSE);
 
 	/* Walk the same-hash neighbours, first backwards, then forwards. */
 	for (d = 0; d < 2 && !conflict; d++) {
 		other = avl_walk(&zap->zap_m.zap_avl, mze, directions[d]);
 		while (other != NULL && other->mze_hash == mze->mze_hash) {
 			if (zn == NULL) {
 				/* Build the name lazily, on first use. */
 				zn = zap_name_alloc(zap,
 				    MZE_PHYS(zap, mze)->mze_name, MT_FIRST);
 				allocdzn = B_TRUE;
 			}
 			if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
 				conflict = B_TRUE;
 				break;
 			}
 			other = avl_walk(&zap->zap_m.zap_avl, other,
 			    directions[d]);
 		}
 	}
 
 	if (allocdzn)
 		zap_name_free(zn);
 	return (conflict);
 }
 
 /*
  * Routines for manipulating attributes.
  */
 
 /*
  * Look up "name" with exact matching and copy its value into "buf".
  * Convenience wrapper around zap_lookup_norm() that requests neither the
  * real name nor the normalization-conflict indication.
  */
 int
 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	int err;
 
 	err = zap_lookup_norm(os, zapobj, name, integer_size,
 	    num_integers, buf, MT_EXACT, NULL, 0, NULL);
 
 	return (err);
 }
 
 /*
  * Look up "name" in the ZAP object using match type "mt".
  *
  * On success the value is copied into "buf".  If "realname" is non-NULL the
  * entry's stored name is copied into it (at most rn_len bytes); if "ncp" is
  * non-NULL it is set to whether another entry conflicts with this one
  * under normalization.
  *
  * Returns 0 on success; the error from zap_lockdir(); ENOTSUP if the name
  * cannot be used with this ZAP and match type; and, for a microzap, ENOENT
  * if there is no such entry, EOVERFLOW if num_integers < 1, or EINVAL if
  * integer_size != 8.  A fat zap returns whatever fzap_lookup() reports.
  */
 int
 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
 	zap_t *zap;
 	int err;
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (!zap->zap_ismicro) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
 		    realname, rn_len, ncp);
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			/* A microzap value is always one 8-byte integer. */
 			if (num_integers < 1) {
 				err = SET_ERROR(EOVERFLOW);
 			} else if (integer_size != 8) {
 				err = SET_ERROR(EINVAL);
 			} else {
 				*(uint64_t *)buf =
 				    MZE_PHYS(zap, mze)->mze_value;
 				/*
 				 * realname is optional (zap_lookup() passes
 				 * NULL), so never hand a NULL destination
 				 * to strlcpy().
 				 */
 				if (realname != NULL) {
 					(void) strlcpy(realname,
 					    MZE_PHYS(zap, mze)->mze_name,
 					    rn_len);
 				}
 				if (ncp) {
 					*ncp = mzap_normalization_conflict(zap,
 					    zn, mze);
 				}
 			}
 		}
 	}
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Start a prefetch for the entry with the given binary key.  Returns 0, the
  * error from zap_lockdir(), or ENOTSUP if no name could be allocated.
  */
 int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
 	zap_name_t *zn;
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 
 	/* err is still the 0 returned by zap_lockdir(). */
 	return (err);
 }
 
 /*
  * Look up the entry with the given binary key and copy its value into
  * "buf".  Returns the result of fzap_lookup(), the error from
  * zap_lockdir(), or ENOTSUP if no name could be allocated.
  */
 int
 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	zap_name_t *zn;
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	/* No real-name buffer or conflict flag is requested. */
 	err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL);
 
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Test whether "name" exists in the ZAP object without reading its value.
  * Returns 0 if it exists, otherwise the lookup error.
  */
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
 	int err;
 
 	/* A zero-sized lookup: the value itself is never wanted. */
 	err = zap_lookup_norm(os, zapobj, name, 0,
 	    0, NULL, MT_EXACT, NULL, 0, NULL);
 
 	switch (err) {
 	case EOVERFLOW:
 	case EINVAL:
 		/* found, but skipped reading the value */
 		return (0);
 	default:
 		return (err);
 	}
 }
 
 /*
  * Report the integer size and integer count of the value stored under
  * "name".  Either output pointer may be NULL on the microzap path, where
  * the answer is always one 8-byte integer.  Returns 0, the error from
  * zap_lockdir(), ENOTSUP if the name cannot be used with this ZAP, ENOENT
  * if a microzap has no such entry, or the result of fzap_length().
  */
 int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_name_t *zn;
 	zap_t *zap;
 	int err;
 
 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (err != 0)
 		return (err);
 
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		if (mze_find(zn) == NULL) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			if (integer_size != NULL)
 				*integer_size = 8;
 			if (num_integers != NULL)
 				*num_integers = 1;
 		}
 	} else {
 		err = fzap_length(zn, integer_size, num_integers);
 	}
 
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Report the integer size and integer count of the entry stored under a
  * uint64-array key.
  *
  * Returns the zap_lockdir() error, ENOTSUP if zap_name_alloc_uint64()
  * cannot build a name from the key, or the result of fzap_length().
  */
 int
 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		error = fzap_length(zn, integer_size, num_integers);
 		zap_name_free(zn);
 		zap_unlockdir(zap);
 		return (error);
 	}
 
 	zap_unlockdir(zap);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Store a new (name, value) entry in a free chunk of a microzap.
  *
  * The caller must hold zap_rwlock as writer.  A chunk is free when the
  * first byte of its name is 0.  The search starts at zap_alloc_next and,
  * if nothing is found and the start was not chunk 0, is repeated once
  * from chunk 0.  On success the entry count is bumped, zap_alloc_next is
  * moved just past the chunk used (wrapping to 0 at the end), and the
  * entry is inserted with mze_insert().
  *
  * The name is copied with a bound of MZAP_NAME_LEN so that a key which is
  * unexpectedly too long can never write past the chunk's name field; the
  * callers in this file only take this path when
  * strlen(key) < MZAP_NAME_LEN, so no valid name is truncated.
  */
 static void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
 	int i;
 	zap_t *zap = zn->zn_zap;
 	int start = zap->zap_m.zap_alloc_next;
 	uint32_t cd;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 #ifdef ZFS_DEBUG
 	/* Debug builds verify that the name is not already present. */
 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
 
 	cd = mze_find_unused_cd(zap, zn->zn_hash);
 	/* given the limited size of the microzap, this can't happen */
 	ASSERT(cd < zap_maxcd(zap));
 
 again:
 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		if (mze->mze_name[0] == 0) {
 			mze->mze_value = value;
 			mze->mze_cd = cd;
 			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
 			    MZAP_NAME_LEN);
 			zap->zap_m.zap_num_entries++;
 			zap->zap_m.zap_alloc_next = i+1;
 			if (zap->zap_m.zap_alloc_next ==
 			    zap->zap_m.zap_num_chunks)
 				zap->zap_m.zap_alloc_next = 0;
 			VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
 			return;
 		}
 	}
 	if (start != 0) {
 		/* Nothing free in [start, end): retry over the whole array. */
 		start = 0;
 		goto again;
 	}
 	ASSERT(!"out of entries!");
 }
 
 /*
  * Add an entry called `key' with the given value to a zap object.
  *
  * The zap is locked as writer.  Three cases are handled:
  *  - fat zap: the request is passed to fzap_add();
  *  - microzap, but the value is not a single 8-byte integer or the key is
  *    MZAP_NAME_LEN characters or longer: the zap is upgraded with
  *    mzap_upgrade() and, if that succeeds, fzap_add() is used;
  *  - microzap otherwise: EEXIST if mze_find() finds the name, else the
  *    entry is stored with mzap_addent().
  *
  * Returns the zap_lockdir() error, ENOTSUP if no name can be allocated,
  * or the result of the path taken above.
  */
 int
 zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int err;
 	mzap_ent_t *mze;
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, key, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_add(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN) {
 		/* The entry does not fit the microzap format: upgrade first. */
 		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0)
 			err = fzap_add(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		mze = mze_find(zn);
 		if (mze != NULL) {
 			err = SET_ERROR(EEXIST);
 		} else {
 			mzap_addent(zn, *intval);
 		}
 	}
 	/* zap must be re-read from zn before zn is freed (see above). */
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Add an entry stored under a uint64-array key.
  *
  * The zap is locked as writer and the request is passed to fzap_add().
  * Returns the zap_lockdir() error, ENOTSUP if zap_name_alloc_uint64()
  * cannot build a name from the key, or the result of fzap_add().
  */
 int
 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_add(zn, integer_size, num_integers, val, tx);
 
 	/* fzap_add() may change zap, so pick it up again before freeing zn */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* zap may be NULL if fzap_add() failed; then there is no lock to drop */
 	if (zap == NULL)
 		return (error);
 
 	zap_unlockdir(zap);
 	return (error);
 }
 
 /*
  * Set the value stored under `name', creating the entry if necessary.
  *
  * The zap is locked as writer.  Three cases are handled:
  *  - fat zap: the request is passed to fzap_update();
  *  - microzap, but the value is not a single 8-byte integer or the name
  *    is MZAP_NAME_LEN characters or longer: the zap is upgraded with
  *    mzap_upgrade() and, if that succeeds, fzap_update() is used;
  *  - microzap otherwise: an existing entry has its value overwritten in
  *    place, and a missing one is created with mzap_addent().
  *
  * Returns the zap_lockdir() error, ENOTSUP if no name can be allocated,
  * or the result of the path taken above.
  */
 int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	mzap_ent_t *mze;
 	/*
 	 * NOTE(review): oldval is assigned only under ZFS_DEBUG, and only
 	 * when the zap_lookup() below succeeds; its sole reader is the
 	 * ASSERT3U in the microzap branch.
 	 */
 	uint64_t oldval;
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 	int err;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * If there is an old value, it shouldn't change across the
 	 * lockdir (eg, due to bprewrite's xlation).
 	 */
 	if (integer_size == 8 && num_integers == 1)
 		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
 #endif
 
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_update(zn, integer_size, num_integers, val, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		/* The entry does not fit the microzap format: upgrade first. */
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    zapobj, integer_size, num_integers, name);
 		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0)
 			err = fzap_update(zn, integer_size, num_integers,
 			    val, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		mze = mze_find(zn);
 		if (mze != NULL) {
 			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
 			MZE_PHYS(zap, mze)->mze_value = *intval;
 		} else {
 			mzap_addent(zn, *intval);
 		}
 	}
 	/* zap must be re-read from zn before zn is freed (see above). */
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
 		zap_unlockdir(zap);
 	return (err);
 }
 
 /*
  * Set the value stored under a uint64-array key.
  *
  * The zap is locked as writer and the request is passed to fzap_update().
  * Returns the zap_lockdir() error, ENOTSUP if zap_name_alloc_uint64()
  * cannot build a name from the key, or the result of fzap_update().
  */
 int
 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_update(zn, integer_size, num_integers, val, tx);
 
 	/* fzap_update() may change zap, so pick it up again before freeing zn */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* zap may be NULL after a failure; then there is no lock to drop */
 	if (zap == NULL)
 		return (error);
 
 	zap_unlockdir(zap);
 	return (error);
 }
 
 /*
  * Remove the entry called `name'.  This is zap_remove_norm() with the
  * match type fixed to MT_EXACT; its return value is passed through.
  */
 int
 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 {
 	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
 }
 
 /*
  * Remove the entry matching `name' under match type `mt'.
  *
  * The zap is locked as writer.  For a fat zap the work is done by
  * fzap_remove().  For a microzap the entry is looked up with mze_find();
  * if present, the entry count is decremented, its chunk is zeroed and the
  * entry is dropped with mze_remove().
  *
  * Returns the zap_lockdir() error, ENOTSUP if no name can be allocated,
  * ENOENT if a microzap has no such entry, or the result of fzap_remove().
  */
 int
 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	mzap_ent_t *ent;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
 		zap_unlockdir(zap);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		ent = mze_find(zn);
 		if (ent != NULL) {
 			zap->zap_m.zap_num_entries--;
 			/* the chunk id is read before the entry is removed */
 			bzero(&zap_m_phys(zap)->mz_chunk[ent->mze_chunkid],
 			    sizeof (mzap_ent_phys_t));
 			mze_remove(zap, ent);
 		} else {
 			error = SET_ERROR(ENOENT);
 		}
 	} else {
 		error = fzap_remove(zn, tx);
 	}
 
 	zap_name_free(zn);
 	zap_unlockdir(zap);
 	return (error);
 }
 
 /*
  * Remove the entry stored under a uint64-array key.
  *
  * The zap is locked as writer and the request is passed to fzap_remove().
  * Returns the zap_lockdir() error, ENOTSUP if zap_name_alloc_uint64()
  * cannot build a name from the key, or the result of fzap_remove().
  */
 int
 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn != NULL) {
 		error = fzap_remove(zn, tx);
 		zap_name_free(zn);
 		zap_unlockdir(zap);
 		return (error);
 	}
 
 	zap_unlockdir(zap);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Routines for iterating over the attributes.
  */
 
 /*
  * Initialize a cursor over `zapobj' that resumes from a previously
  * serialized position.  No zap or leaf is attached yet, and the hash and
  * collision differentiator start at 0.
  */
 void
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
 {
 	/* what to iterate over, and where to resume */
 	zc->zc_objset = os;
 	zc->zc_zapobj = zapobj;
 	zc->zc_serialized = serialized;
 
 	/* nothing is held yet */
 	zc->zc_zap = NULL;
 	zc->zc_leaf = NULL;
 
 	/* position is not yet decoded from zc_serialized */
 	zc->zc_hash = 0;
 	zc->zc_cd = 0;
 }
 
 /*
  * Initialize a cursor over `zapobj' with a serialized position of 0.
  */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	zap_cursor_init_serialized(zc, os, zapobj, 0);
 }
 
 /*
  * Release whatever the cursor still holds and detach it from its objset.
  *
  * An attached zap or leaf has its rwlock re-entered as reader before it
  * is handed to zap_unlockdir() or zap_put_leaf() respectively.
  */
 void
 zap_cursor_fini(zap_cursor_t *zc)
 {
 	zap_t *zap = zc->zc_zap;
 
 	if (zap != NULL) {
 		rw_enter(&zap->zap_rwlock, RW_READER);
 		zap_unlockdir(zap);
 		zc->zc_zap = NULL;
 	}
 
 	if (zc->zc_leaf != NULL) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
 	}
 
 	zc->zc_objset = NULL;
 }
 
 /*
  * Encode the cursor position as a single 64-bit value.
  *
  * A cursor whose hash is -1ULL serializes to -1ULL.  A cursor with no zap
  * attached returns the serialized value it was initialized with.
  * Otherwise the result is laid out as
  *
  * [ collision differentiator | zap_hashbits()-bit hash value ]
  *
  * The high 32 bits are kept zero when possible so that 32-bit programs
  * can use the value: usually a small (28-bit) hash is used, leaving room
  * for 4 bits of cd in the low 32 bits.
  */
 uint64_t
 zap_cursor_serialize(zap_cursor_t *zc)
 {
 	int hb;
 
 	if (zc->zc_hash == -1ULL)
 		return (-1ULL);
 	if (zc->zc_zap == NULL)
 		return (zc->zc_serialized);
 
 	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
 	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
 
 	hb = zap_hashbits(zc->zc_zap);
 	return ((zc->zc_hash >> (64 - hb)) | ((uint64_t)zc->zc_cd << hb));
 }
 
 /*
  * Fill `za' with the entry at the cursor's current position.
  *
  * On the first call (no zap attached) the zap is locked as reader and the
  * serialized position is decoded into zc_hash and zc_cd; on later calls
  * only the zap's rwlock is re-entered as reader.  In both cases the rwlock
  * is dropped again before returning while the zap stays attached to the
  * cursor.
  *
  * For a fat zap the work is done by fzap_cursor_retrieve().  For a
  * microzap the entry at (zc_hash, zc_cd), or else the nearest following
  * entry in the AVL tree, is returned and the cursor is moved onto it.
  *
  * Returns ENOENT once the cursor is exhausted (zc_hash == -1ULL), the
  * zap_lockdir() error if the zap cannot be locked, or the result of the
  * path taken above.
  *
  * The microzap name is copied with a bound of MZAP_NAME_LEN, the most an
  * on-disk name field can hold, so a name that is not NUL-terminated on
  * disk cannot overrun za_name.
  */
 int
 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err;
 	avl_index_t idx;
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 
 	if (zc->zc_hash == -1ULL)
 		return (SET_ERROR(ENOENT));
 
 	if (zc->zc_zap == NULL) {
 		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
 		if (err)
 			return (err);
 
 		/*
 		 * To support zap_cursor_init_serialized, advance, retrieve,
 		 * we must add to the existing zc_cd, which may already
 		 * be 1 due to the zap_cursor_advance.
 		 */
 		ASSERT(zc->zc_hash == 0);
 		hb = zap_hashbits(zc->zc_zap);
 		zc->zc_hash = zc->zc_serialized << (64 - hb);
 		zc->zc_cd += zc->zc_serialized >> hb;
 		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
 			zc->zc_cd = 0;
 	} else {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	}
 	if (!zc->zc_zap->zap_ismicro) {
 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
 		mze_tofind.mze_hash = zc->zc_hash;
 		mze_tofind.mze_cd = zc->zc_cd;
 
 		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
 		if (mze == NULL) {
 			/* no exact match: continue from the next entry */
 			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
 			    idx, AVL_AFTER);
 		}
 		if (mze) {
 			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
 			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
 			za->za_normalization_conflict =
 			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
 			za->za_first_integer = mzep->mze_value;
 			(void) strlcpy(za->za_name, mzep->mze_name,
 			    MZAP_NAME_LEN);
 			zc->zc_hash = mze->mze_hash;
 			zc->zc_cd = mze->mze_cd;
 			err = 0;
 		} else {
 			/* mark the cursor as exhausted */
 			zc->zc_hash = -1ULL;
 			err = SET_ERROR(ENOENT);
 		}
 	}
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
 /*
  * Step the cursor past its current entry by bumping the collision
  * differentiator.  An exhausted cursor (zc_hash == -1ULL) is left alone.
  */
 void
 zap_cursor_advance(zap_cursor_t *zc)
 {
 	if (zc->zc_hash != -1ULL)
 		zc->zc_cd++;
 }
 
 /*
  * Position the cursor on the entry matching `name' under match type `mt'.
  *
  * The zap is locked as reader on first use, otherwise only its rwlock is
  * re-entered; the rwlock is dropped again before returning.  For a fat
  * zap the work is done by fzap_cursor_move_to_key(); for a microzap the
  * cursor takes the hash and cd of the entry found by mze_find().
  *
  * Returns the zap_lockdir() error, ENOTSUP if no name can be allocated,
  * ENOENT if a microzap has no such entry, or the fat-zap result.
  */
 int
 zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
 {
 	zap_name_t *zn;
 	mzap_ent_t *ent;
 	int error = 0;
 
 	if (zc->zc_zap != NULL) {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	} else {
 		error = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
 		if (error != 0)
 			return (error);
 	}
 
 	zn = zap_name_alloc(zc->zc_zap, name, mt);
 	if (zn == NULL) {
 		rw_exit(&zc->zc_zap->zap_rwlock);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zc->zc_zap->zap_ismicro) {
 		ent = mze_find(zn);
 		if (ent != NULL) {
 			zc->zc_hash = ent->mze_hash;
 			zc->zc_cd = ent->mze_cd;
 		} else {
 			error = SET_ERROR(ENOENT);
 		}
 	} else {
 		error = fzap_cursor_move_to_key(zc, zn);
 	}
 
 	zap_name_free(zn);
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (error);
 }
 
 /*
  * Fill `zs' with statistics about a zap object.
  *
  * The structure is zeroed first.  A microzap reports its dbuf size, its
  * entry count and a block count of 1; a fat zap is described by
  * fzap_get_stats().  Returns the zap_lockdir() error, or 0.
  */
 int
 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (error != 0)
 		return (error);
 
 	bzero(zs, sizeof (*zs));
 
 	if (!zap->zap_ismicro) {
 		fzap_get_stats(zap, zs);
 	} else {
 		zs->zs_num_blocks = 1;
 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
 		zs->zs_blocksize = zap->zap_dbuf->db_size;
 	}
 
 	zap_unlockdir(zap);
 	return (0);
 }
 
 /*
  * Estimate how many bytes an operation on `name' may write or overwrite,
  * adding the estimate to *towrite and *tooverwrite.
  *
  * Worst case used when the affected blocks cannot be determined:
  * - 3 blocks overwritten: target leaf, ptrtbl block, header block
  * - 4 new blocks written if adding:
  *	- 2 blocks for possibly split leaves,
  *	- 2 grown ptrtbl blocks
  * This also accommodates an add to a fairly large microzap that results
  * in a promotion to fatzap.
  *
  * Returns 0, the zap_lockdir() error, or the result of
  * fzap_count_write().
  */
 int
 zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
     uint64_t *towrite, uint64_t *tooverwrite)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	/*
 	 * Without a name we cannot tell which blocks are affected, so
 	 * account for the worst case described above.
 	 */
 	if (name == NULL) {
 		*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
 		return (0);
 	}
 
 	/*
 	 * Lock the zap with adding == FALSE: passing the real value of add
 	 * could trigger a mzap_upgrade(), and this function only evaluates
 	 * the operation, so it must not cause an upgrade.
 	 */
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 	if (error != 0)
 		return (error);
 
 	if (zap->zap_ismicro) {
 		/*
 		 * Named operation on a microzap.  The header block is
 		 * charged as an overwrite or a write depending on whether
 		 * it is freeable.
 		 */
 		if (dmu_buf_freeable(zap->zap_dbuf))
 			*tooverwrite += MZAP_MAX_BLKSZ;
 		else
 			*towrite += MZAP_MAX_BLKSZ;
 
 		/*
 		 * It is hard to tell whether an add will promote this
 		 * microzap to a fatzap, so assume it will: 4 new blocks
 		 * (2 new split leaves, 2 grown ptrtbl blocks).
 		 */
 		if (add)
 			*towrite += 4 * MZAP_MAX_BLKSZ;
 	} else {
 		zn = zap_name_alloc(zap, name, MT_EXACT);
 		if (zn != NULL) {
 			error = fzap_count_write(zn, add, towrite,
 			    tooverwrite);
 			zap_name_free(zn);
 		} else {
 			/* treated the same as the (name == NULL) case */
 			*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
 		}
 	}
 
 	zap_unlockdir(zap);
 	return (error);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 296519)
@@ -1,6763 +1,6764 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
  * ZFS ioctls.
  *
  * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
  * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
  *
  * There are two ways that we handle ioctls: the legacy way where almost
  * all of the logic is in the ioctl callback, and the new way where most
  * of the marshalling is handled in the common entry point, zfsdev_ioctl().
  *
  * Non-legacy ioctls should be registered by calling
  * zfs_ioctl_register() from zfs_ioctl_init().  The ioctl is invoked
  * from userland by lzc_ioctl().
  *
  * The registration arguments are as follows:
  *
  * const char *name
  *   The name of the ioctl.  This is used for history logging.  If the
  *   ioctl returns successfully (the callback returns 0), and allow_log
  *   is true, then a history log entry will be recorded with the input &
  *   output nvlists.  The log entry can be printed with "zpool history -i".
  *
  * zfs_ioc_t ioc
  *   The ioctl request number, which userland will pass to ioctl(2).
  *   The ioctl numbers can change from release to release, because
  *   the caller (libzfs) must be matched to the kernel.
  *
  * zfs_secpolicy_func_t *secpolicy
  *   This function will be called before the zfs_ioc_func_t, to
  *   determine if this operation is permitted.  It should return EPERM
  *   on failure, and 0 on success.  Checks include determining if the
  *   dataset is visible in this zone, and if the user has either all
  *   zfs privileges in the zone (SYS_MOUNT), or has been granted permission
  *   to do this operation on this dataset with "zfs allow".
  *
  * zfs_ioc_namecheck_t namecheck
  *   This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
  *   name, a dataset name, or nothing.  If the name is not well-formed,
  *   the ioctl will fail and the callback will not be called.
  *   Therefore, the callback can assume that the name is well-formed
  *   (e.g. is null-terminated, doesn't have more than one '@' character,
  *   doesn't have invalid characters).
  *
  * zfs_ioc_poolcheck_t pool_check
  *   This specifies requirements on the pool state.  If the pool does
  *   not meet them (is suspended or is readonly), the ioctl will fail
  *   and the callback will not be called.  If any checks are specified
  *   (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
  *   Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
  *   POOL_CHECK_READONLY).
  *
  * boolean_t smush_outnvlist
  *   If smush_outnvlist is true, then the output is presumed to be a
  *   list of errors, and it will be "smushed" down to fit into the
  *   caller's buffer, by removing some entries and replacing them with a
  *   single "N_MORE_ERRORS" entry indicating how many were removed.  See
  *   nvlist_smush() for details.  If smush_outnvlist is false, and the
  *   outnvlist does not fit into the userland-provided buffer, then the
  *   ioctl will fail with ENOMEM.
  *
  * zfs_ioc_func_t *func
  *   The callback function that will perform the operation.
  *
  *   The callback should return 0 on success, or an error number on
  *   failure.  If the function fails, the userland ioctl will return -1,
  *   and errno will be set to the callback's return value.  The callback
  *   will be called with the following arguments:
  *
  *   const char *name
  *     The name of the pool or dataset to operate on, from
  *     zfs_cmd_t:zc_name.  The 'namecheck' argument specifies the
  *     expected type (pool, dataset, or none).
  *
  *   nvlist_t *innvl
  *     The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src.  Or
  *     NULL if no input nvlist was provided.  Changes to this nvlist are
  *     ignored.  If the input nvlist could not be deserialized, the
  *     ioctl will fail and the callback will not be called.
  *
  *   nvlist_t *outnvl
  *     The output nvlist, initially empty.  The callback can fill it in,
  *     and it will be returned to userland by serializing it into
  *     zfs_cmd_t:zc_nvlist_dst.  If it is non-empty, and serialization
  *     fails (e.g. because the caller didn't supply a large enough
  *     buffer), then the overall ioctl will fail.  See the
  *     'smush_outnvlist' argument above for additional behaviors.
  *
  *     There are two typical uses of the output nvlist:
  *       - To return state, e.g. property values.  In this case,
  *         smush_outnvlist should be false.  If the buffer was not large
  *         enough, the caller will reallocate a larger buffer and try
  *         the ioctl again.
  *
  *       - To return multiple errors from an ioctl which makes on-disk
  *         changes.  In this case, smush_outnvlist should be true.
  *         Ioctls which make on-disk modifications should generally not
  *         use the outnvl if they succeed, because the caller can not
  *         distinguish between the operation failing, and
  *         deserialization failing.
  */
 #ifdef __FreeBSD__
 #include "opt_kstack_pages.h"
 #endif
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/sunddi.h>
 #include <sys/policy.h>
 #include <sys/zone.h>
 #include <sys/nvpair.h>
 #include <sys/mount.h>
 #include <sys/taskqueue.h>
 #include <sys/sdt.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 #include <sys/zio_checksum.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
 #include "zfs_comutil.h"
 #include "zfs_ioctl_compat.h"
 
 CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
 
 static struct cdev *zfsdev;
 
 extern void zfs_init(void);
 extern void zfs_fini(void);
 
 uint_t zfs_fsyncer_key;
 extern uint_t rrw_tsd_key;
 static uint_t zfs_allow_log_key;
 extern uint_t zfs_geom_probe_vdev_key;
 
 /*
  * Function types used by the ioctl vector:
  *  - legacy callback, which works directly on the zfs_cmd_t;
  *  - non-legacy callback, called with (name, innvl, outnvl);
  *  - security policy check, called before the ioctl callback.
  */
 typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
 typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
 typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
 
 /*
  * What an ioctl expects in zfs_cmd_t:zc_name: nothing, a pool name, or a
  * dataset name.
  */
 typedef enum {
 	NO_NAME,
 	POOL_NAME,
 	DATASET_NAME
 } zfs_ioc_namecheck_t;
 
 /*
  * Requirements on the pool state for an ioctl.  Each value is a distinct
  * bit, so checks can be or-ed together
  * (e.g. POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY).
  */
 typedef enum {
 	POOL_CHECK_NONE		= 1 << 0,
 	POOL_CHECK_SUSPENDED	= 1 << 1,
 	POOL_CHECK_READONLY	= 1 << 2,
 } zfs_ioc_poolcheck_t;
 
 /*
  * Description of one ioctl.  The fields appear to correspond to the
  * registration arguments documented in the comment at the top of this
  * file (name, secpolicy, namecheck, pool_check, smush_outnvlist, func).
  */
 typedef struct zfs_ioc_vec {
 	zfs_ioc_legacy_func_t	*zvec_legacy_func;	/* legacy callback */
 	zfs_ioc_func_t		*zvec_func;	/* non-legacy callback */
 	zfs_secpolicy_func_t	*zvec_secpolicy;	/* permission check */
 	zfs_ioc_namecheck_t	zvec_namecheck;	/* expected zc_name kind */
 	boolean_t		zvec_allow_log;	/* may be history-logged */
 	zfs_ioc_poolcheck_t	zvec_pool_check;	/* pool-state checks */
 	boolean_t		zvec_smush_outnvlist;	/* shrink error list */
 	const char		*zvec_name;	/* ioctl name */
 } zfs_ioc_vec_t;
 
 /*
  * This array is indexed by zfs_userquota_prop_t, so the order of the
  * entries must not be changed independently of that enum.
  */
 static const char *userquota_perms[] = {
 	ZFS_DELEG_PERM_USERUSED,
 	ZFS_DELEG_PERM_USERQUOTA,
 	ZFS_DELEG_PERM_GROUPUSED,
 	ZFS_DELEG_PERM_GROUPQUOTA,
 };
 
 static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
 static int zfs_check_settable(const char *name, nvpair_t *property,
     cred_t *cr);
 static int zfs_check_clearable(char *dataset, nvlist_t *props,
     nvlist_t **errors);
 static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
     boolean_t *);
 int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
 static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
  
 static void zfsdev_close(void *data);
 
 static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
 
 /*
  * Format a debug message into a local 512-byte buffer and fire the
  * zfs-dprintf DTrace probe with it.  Longer messages are truncated by
  * vsnprintf().
  */
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	const char *newfile;
 	const char *slash;
 	char buf[512];
 	va_list adx;
 
 	/*
 	 * Keep only what follows the last '/' in the file name, which gets
 	 * rid of the annoying "../common/" prefix.
 	 */
 	slash = strrchr(file, '/');
 	newfile = (slash == NULL) ? file : slash + 1;
 
 	va_start(adx, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
 	/*
 	 * To get this data, use the zfs-dprintf probe as so:
 	 * dtrace -q -n 'zfs-dprintf \
 	 *	/stringof(arg0) == "dbuf.c"/ \
 	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
 	 * arg0 = file name
 	 * arg1 = function name
 	 * arg2 = line number
 	 * arg3 = message
 	 */
 	DTRACE_PROBE4(zfs__dprintf,
 	    char *, newfile, char *, func, int, line, char *, buf);
 }
 
 /*
  * Free a history buffer of HIS_MAX_RECORD_LEN bytes, the size that
  * history_str_get() allocates.
  */
 static void
 history_str_free(char *buf)
 {
 	kmem_free(buf, HIS_MAX_RECORD_LEN);
 }
 
 /*
  * Copy the history string referenced by zc->zc_history into a newly
  * allocated buffer of HIS_MAX_RECORD_LEN bytes.
  *
  * Returns NULL if zc_history is 0 or if copyinstr() fails; otherwise
  * returns the buffer, always NUL-terminated in its last byte.  The
  * caller releases it with history_str_free().
  */
 static char *
 history_str_get(zfs_cmd_t *zc)
 {
 	char *buf;
 	int error;
 
 	if (zc->zc_history == 0)
 		return (NULL);
 
 	buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
 	error = copyinstr((void *)(uintptr_t)zc->zc_history,
 	    buf, HIS_MAX_RECORD_LEN, NULL);
 	if (error == 0) {
 		buf[HIS_MAX_RECORD_LEN -1] = '\0';
 		return (buf);
 	}
 
 	history_str_free(buf);
 	return (NULL);
 }
 
 /*
  * Report whether the named dataset is the pool's bootfs, i.e. whether its
  * objset id equals spa_bootfs() of its pool.  A dataset that cannot be
  * held is reported as not bootable.
  */
 static boolean_t
 zfs_is_bootfs(const char *name)
 {
 	objset_t *os;
 	boolean_t bootfs;
 
 	if (dmu_objset_hold(name, FTAG, &os) != 0)
 		return (B_FALSE);
 
 	bootfs = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
 	dmu_objset_rele(os, FTAG);
 	return (bootfs);
 }
 
 /*
  * Return 1 if the pool can be opened and its spa version is lower than
  * `version'; return 0 otherwise, including when the pool cannot be
  * opened.
  */
 static int
 zfs_earlier_version(const char *name, int version)
 {
 	spa_t *spa;
 	int earlier;
 
 	if (spa_open(name, &spa, FTAG) != 0)
 		return (0);
 
 	earlier = (spa_version(spa) < version) ? 1 : 0;
 	spa_close(spa, FTAG);
 	return (earlier);
 }
 
 /*
  * Return B_TRUE if the ZPL version of the named dataset is lower than
  * `version'.
  *
  * B_TRUE is also returned when the dataset cannot be held, when it is
  * not of type DMU_OST_ZFS, or when its version property cannot be read.
  */
 static boolean_t
 zpl_earlier_version(const char *name, int version)
 {
 	objset_t *os;
 	uint64_t zplversion;
 	boolean_t rc = B_TRUE;
 
 	if (dmu_objset_hold(name, FTAG, &os) != 0)
 		return (rc);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		/* XXX reading from non-owned objset */
 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
 			rc = zplversion < version;
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (rc);
 }
 
 /*
  * Record the history string supplied with `zc' in the history of the pool
  * named by zc->zc_name.
  *
  * Nothing happens if there is no history string.  The string is logged
  * only if the pool can be opened and its version is at least
  * SPA_VERSION_ZPOOL_HISTORY; the result of spa_history_log() is ignored.
  */
 static void
 zfs_log_history(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	char *buf;
 
 	buf = history_str_get(zc);
 	if (buf == NULL)
 		return;
 
 	if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
 			(void) spa_history_log(spa, buf);
 		spa_close(spa, FTAG);
 	}
 
 	history_str_free(buf);
 }
 
 /*
  * Policy for top-level read operations (list pools).  Requires no privileges,
  * and can be used in the local zone, as there is no associated dataset.
  *
  * All arguments are ignored; the function always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (0);
 }
 
 /*
  * Policy for dataset read operations (list children, get statistics).
  * No privileges are required, but outside the global zone the dataset
  * named by zc->zc_name must be visible; otherwise ENOENT is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread) &&
 	    !zone_dataset_visible(zc->zc_name, NULL))
 		return (SET_ERROR(ENOENT));
 
 	return (0);
 }
 
 /*
  * Decide whether the current thread may access `dataset', given the value
  * of its zoned property.
  *
  * In the global zone access is refused with EPERM only when the dataset
  * is zoned and secpolicy_zfs() fails.  In a local zone the dataset must
  * be visible (ENOENT otherwise), zoned, and writable by the zone (EPERM
  * otherwise).  Returns 0 when access is allowed.
  */
 static int
 zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
 {
 	int writable = 1;
 
 	if (INGLOBALZONE(curthread)) {
 		/*
 		 * If the fs is zoned, only root can access it from the
 		 * global zone.
 		 */
 		if (secpolicy_zfs(cr) && zoned)
 			return (SET_ERROR(EPERM));
 		return (0);
 	}
 
 	/*
 	 * The dataset must be visible by this zone -- check this first
 	 * so they don't see EPERM on something they shouldn't know about.
 	 */
 	if (!zone_dataset_visible(dataset, &writable))
 		return (SET_ERROR(ENOENT));
 
 	/* In a local zone, the 'zoned' property must be set. */
 	if (!zoned)
 		return (SET_ERROR(EPERM));
 
 	/* must be writable by this zone */
 	if (!writable)
 		return (SET_ERROR(EPERM));
 
 	return (0);
 }
 
 /*
  * Zone check for a dataset identified by name: read its "jailed" property
  * and pass the value to zfs_dozonecheck_impl().  Returns ENOENT if the
  * property cannot be read.
  */
 static int
 zfs_dozonecheck(const char *dataset, cred_t *cr)
 {
 	uint64_t jailed;
 
 	if (dsl_prop_get_integer(dataset, "jailed", &jailed, NULL) != 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zfs_dozonecheck_impl(dataset, jailed, cr));
 }
 
 /*
  * Zone check for a dataset that is already held: read its "jailed"
  * property from `ds' and pass the value to zfs_dozonecheck_impl().
  * Returns ENOENT if the property cannot be read.
  */
 static int
 zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
 {
 	uint64_t jailed;
 
 	if (dsl_prop_get_int_ds(ds, "jailed", &jailed) != 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zfs_dozonecheck_impl(dataset, jailed, cr));
 }
 
 /*
  * Check whether `cr' may perform the operation `perm' on the held dataset.
  *
  * The zone check must pass first.  After that, access is granted if
  * secpolicy_zfs() succeeds; if it fails, the decision is left to
  * dsl_deleg_access_impl().
  */
 static int
 zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
     const char *perm, cred_t *cr)
 {
 	int error;
 
 	error = zfs_dozonecheck_ds(name, ds, cr);
 	if (error != 0)
 		return (error);
 
 	if (secpolicy_zfs(cr) == 0)
 		return (0);
 
 	return (dsl_deleg_access_impl(ds, perm, cr));
 }
 
 /*
  * Check whether `cr' may perform the operation `perm' on the dataset
  * called `name'.
  *
  * The pool and the dataset are held for the duration of the check and
  * released again before returning.  A failure to hold either one is
  * returned as is; otherwise the result comes from
  * zfs_secpolicy_write_perms_ds().
  */
 static int
 zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_pool_hold(name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, name, FTAG, &ds);
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 #ifdef SECLABEL
 /*
  * Policy for setting the security label property.
  *
  * name is the dataset, strval the requested label (a hex label string or
  * ZFS_MLSLABEL_DEFAULT) and cr the credential to check.
  *
  * Returns 0 for success, non-zero for access and other errors: EINVAL if
  * the new label cannot be translated, EPERM for the other failures
  * detected here, or the result of PRIV_POLICY() when a privilege is
  * needed.
  *
  * The existing dataset label is translated into ds_sl before the two
  * labels are compared; the comparison must not run on an uninitialized
  * ds_sl.
  */
 static int
 zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
 {
 	char		ds_hexsl[MAXNAMELEN];
 	bslabel_t	ds_sl, new_sl;
 	boolean_t	new_default = FALSE;
 	uint64_t	zoned;
 	int		needed_priv = -1;
 	int		error;
 
 	/* First get the existing dataset label. */
 	error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error != 0)
 		return (SET_ERROR(EPERM));
 
 	if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
 		new_default = TRUE;
 
 	/* The label must be translatable */
 	if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * In a non-global zone, disallow attempts to set a label that
 	 * doesn't match that of the zone; otherwise no other checks
 	 * are needed.
 	 */
 	if (!INGLOBALZONE(curproc)) {
 		if (new_default || !blequal(&new_sl, CR_SL(CRED())))
 			return (SET_ERROR(EPERM));
 		return (0);
 	}
 
 	/*
 	 * For global-zone datasets (i.e., those whose zoned property is
 	 * "off"), verify that the specified new label is valid for the
 	 * global zone.
 	 */
 	if (dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 		return (SET_ERROR(EPERM));
 	if (!zoned) {
 		if (zfs_check_global_label(name, strval) != 0)
 			return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * If the existing dataset label is nondefault, check if the
 	 * dataset is mounted (label cannot be changed while mounted).
 	 * Get the zfsvfs; if there isn't one, then the dataset isn't
 	 * mounted (or isn't a dataset, doesn't exist, ...).
 	 */
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
 		objset_t *os;
 		static char *setsl_tag = "setsl_tag";
 
 		/*
 		 * Try to own the dataset; abort if there is any error,
 		 * (e.g., already mounted, in use, or other error).
 		 */
 		error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
 		    setsl_tag, &os);
 		if (error != 0)
 			return (SET_ERROR(EPERM));
 
 		dmu_objset_disown(os, setsl_tag);
 
 		if (new_default) {
 			needed_priv = PRIV_FILE_DOWNGRADE_SL;
 			goto out_check;
 		}
 
 		if (hexstr_to_label(strval, &new_sl) != 0)
 			return (SET_ERROR(EPERM));
 
 		/*
 		 * Translate the existing (nondefault) label so that the
 		 * dominance checks below compare real labels.
 		 */
 		if (hexstr_to_label(ds_hexsl, &ds_sl) != 0)
 			return (SET_ERROR(EPERM));
 
 		if (blstrictdom(&ds_sl, &new_sl))
 			needed_priv = PRIV_FILE_DOWNGRADE_SL;
 		else if (blstrictdom(&new_sl, &ds_sl))
 			needed_priv = PRIV_FILE_UPGRADE_SL;
 	} else {
 		/* dataset currently has a default label */
 		if (!new_default)
 			needed_priv = PRIV_FILE_UPGRADE_SL;
 	}
 
 out_check:
 	if (needed_priv != -1)
 		return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
 	return (0);
 }
 #endif	/* SECLABEL */
 
 /*
  * Security policy for setting property 'prop' on dataset 'dsname'.
  *
  * A few properties get an extra check first (see the switch below); any
  * request that survives those checks is then decided by
  * zfs_secpolicy_write_perms() using the property's name as the permission.
  * 'propval' is only consulted for ZFS_PROP_MLSLABEL.
  *
  * Returns 0 if the operation is allowed, otherwise an errno value.
  */
 static int
 zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
     cred_t *cr)
 {
 	char *strval;
 
 	/*
 	 * Check permissions for special properties.
 	 */
 	switch (prop) {
 	case ZFS_PROP_ZONED:
 		/*
 		 * Disallow setting of 'zoned' from within a local zone.
 		 */
 		if (!INGLOBALZONE(curthread))
 			return (SET_ERROR(EPERM));
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
 			char setpoint[MAXNAMELEN];
 			/*
 			 * Unprivileged users are allowed to modify the
 			 * limit on things *under* (ie. contained by)
 			 * the thing they own.
 			 */
 			/*
 			 * The "jailed" property value lands in 'zoned';
 			 * 'setpoint' presumably receives the name of the
 			 * dataset where it was set -- TODO confirm against
 			 * dsl_prop_get_integer().
 			 */
 			if (dsl_prop_get_integer(dsname, "jailed", &zoned,
 			    setpoint))
 				return (SET_ERROR(EPERM));
 			/*
 			 * Refuse unless the property is on and dsname is
 			 * strictly longer than the setpoint name.
 			 */
 			if (!zoned || strlen(dsname) <= strlen(setpoint))
 				return (SET_ERROR(EPERM));
 		}
 		break;
 
 	case ZFS_PROP_MLSLABEL:
 #ifdef SECLABEL
 		if (!is_system_labeled())
 			return (SET_ERROR(EPERM));
 
 		/*
 		 * NOTE(review): the label policy is evaluated with CRED()
 		 * rather than the 'cr' argument -- confirm this is intended.
 		 */
 		if (nvpair_value_string(propval, &strval) == 0) {
 			int err;
 
 			err = zfs_set_slabel_policy(dsname, strval, CRED());
 			if (err != 0)
 				return (err);
 		}
 #else
 		/* Built without SECLABEL: the property cannot be set. */
 		return (EOPNOTSUPP);
 #endif
 		break;
 	}
 
 	return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	/*
 	 * Only the zone check is performed at this point.  Whether the
 	 * caller may actually set permissions is evaluated later in
 	 * dsl_deleg_can_allow().
 	 */
 	return (zfs_dozonecheck(zc->zc_name, cr));
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	const char *dataset = zc->zc_name;
 
 	/* Allowed only with ZFS_DELEG_PERM_ROLLBACK on the dataset. */
 	return (zfs_secpolicy_write_perms(dataset, ZFS_DELEG_PERM_ROLLBACK,
 	    cr));
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	dsl_pool_t *pool;
 	dsl_dataset_t *snap;
 	int rc;
 
 	/* The supplied name has to contain a '@'. */
 	if (strchr(zc->zc_name, '@') == NULL)
 		return (SET_ERROR(EINVAL));
 
 	rc = dsl_pool_hold(zc->zc_name, FTAG, &pool);
 	if (rc != 0)
 		return (rc);
 
 	rc = dsl_dataset_hold_obj(pool, zc->zc_sendobj, FTAG, &snap);
 	if (rc == 0) {
 		/*
 		 * Regenerate the current name of the dataset identified
 		 * by zc_sendobj into zc_name, then run the send
 		 * permission check against that name.
 		 */
 		dsl_dataset_name(snap, zc->zc_name);
 		rc = zfs_secpolicy_write_perms_ds(zc->zc_name, snap,
 		    ZFS_DELEG_PERM_SEND, cr);
 		dsl_dataset_rele(snap, FTAG);
 	}
 
 	dsl_pool_rele(pool, FTAG);
 	return (rc);
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	const char *dataset = zc->zc_name;
 
 	/* Allowed only with ZFS_DELEG_PERM_SEND on the named dataset. */
 	return (zfs_secpolicy_write_perms(dataset, ZFS_DELEG_PERM_SEND, cr));
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	vnode_t *vp;
 	int error;
 
 	error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL,
 	    &vp);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * The path in zc_value must sit on a "zfs" mount whose resource
 	 * string equals zc_name; only then is the delegated share
 	 * permission consulted.  Anything else is refused.
 	 */
 	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") == 0 &&
 	    strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) == 0) {
 		VN_RELE(vp);
 		return (dsl_deleg_access(zc->zc_name,
 		    ZFS_DELEG_PERM_SHARE, cr));
 	}
 
 	VN_RELE(vp);
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Share policy: refused outside the global zone; otherwise allowed when
  * secpolicy_nfs() succeeds, else decided by zfs_secpolicy_deleg_share().
  */
 int
 zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread))
 		return (SET_ERROR(EPERM));
 
 	if (secpolicy_nfs(cr) != 0)
 		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 
 	return (0);
 }
 
 /*
  * SMB ACL policy: refused outside the global zone; otherwise allowed when
  * secpolicy_smb() succeeds, else decided by zfs_secpolicy_deleg_share().
  */
 int
 zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (!INGLOBALZONE(curthread))
 		return (SET_ERROR(EPERM));
 
 	if (secpolicy_smb(cr) != 0)
 		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 
 	return (0);
 }
 
 /*
  * Compute the parent of 'datasetname' into 'parent' (a buffer of
  * 'parentsize' bytes) by cutting the name at its last '@', or, when there
  * is no '@', at its last '/'.
  *
  * Returns 0 on success, ENOENT when the name contains neither separator,
  * and EINVAL when 'parentsize' is not positive.  A name that does not fit
  * is truncated to parentsize - 1 characters before the separator search.
  */
 static int
 zfs_get_parent(const char *datasetname, char *parent, int parentsize)
 {
 	char *cp;
 
 	if (parentsize <= 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Remove the @bla or /bla from the end of the name to get the parent.
 	 *
 	 * strncpy() leaves the destination unterminated when the source
 	 * fills it, so terminate explicitly before strrchr() walks it.
 	 */
 	(void) strncpy(parent, datasetname, parentsize);
 	parent[parentsize - 1] = '\0';
 
 	cp = strrchr(parent, '@');
 	if (cp != NULL) {
 		cp[0] = '\0';
 	} else {
 		cp = strrchr(parent, '/');
 		if (cp == NULL)
 			return (SET_ERROR(ENOENT));
 		cp[0] = '\0';
 	}
 
 	return (0);
 }
 
 /*
  * Destroying 'name' requires both the mount and the destroy permission;
  * the first failing check determines the returned error.
  */
 int
 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
 {
 	int error;
 
 	error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr);
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms(name,
 		    ZFS_DELEG_PERM_DESTROY, cr);
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	const char *target = zc->zc_name;
 
 	/* ioctl-shaped wrapper around zfs_secpolicy_destroy_perms(). */
 	return (zfs_secpolicy_destroy_perms(target, cr));
 }
 
 /*
  * Destroying snapshots with delegated permissions requires
  * descendant mount and destroy permissions.
  *
  * Every entry of the "snaps" nvlist in innvl is checked with
  * zfs_secpolicy_destroy_perms().  EINVAL is returned when "snaps" is
  * missing; otherwise the first error other than ENOENT ends the walk.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvlist_t *snaps;
 	nvpair_t *cur, *next;
 	int error = 0;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 
 	cur = nvlist_next_nvpair(snaps, NULL);
 	while (cur != NULL) {
 		/* Fetch the successor first; 'cur' may be removed below. */
 		next = nvlist_next_nvpair(snaps, cur);
 
 		error = zfs_secpolicy_destroy_perms(nvpair_name(cur), cr);
 		if (error == ENOENT) {
 			/*
 			 * Ignore any snapshots that don't exist (we consider
 			 * them "already destroyed").  Remove the name from the
 			 * nvl here in case the snapshot is created between
 			 * now and when we try to destroy it (in which case
 			 * we don't want to destroy it since we haven't
 			 * checked for permission).
 			 */
 			fnvlist_remove_nvpair(snaps, cur);
 			error = 0;
 		} else if (error != 0) {
 			break;
 		}
 
 		cur = next;
 	}
 
 	return (error);
 }
 
 /*
  * Renaming 'from' to 'to' requires rename and mount permission on 'from',
  * plus create and mount permission on the parent of 'to'.  The checks run
  * in that order and the first failure is returned.
  */
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
 	char	newparent[MAXNAMELEN];
 	int	error;
 
 	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr);
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms(from,
 		    ZFS_DELEG_PERM_MOUNT, cr);
 	}
 	if (error == 0)
 		error = zfs_get_parent(to, newparent, sizeof (newparent));
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms(newparent,
 		    ZFS_DELEG_PERM_CREATE, cr);
 	}
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms(newparent,
 		    ZFS_DELEG_PERM_MOUNT, cr);
 	}
 
 	return (error);
 }
 
 /*
  * Rename policy: checks zfs_secpolicy_rename_perms() for renaming zc_name
  * to zc_value.  When bit 0 of zc_cookie is set (recursive rename) the
  * check is made against the part of zc_name before the '@'; such a name
  * without '@' yields EINVAL.  zc_name is restored before returning.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char *at = NULL;
 	int error;
 
 	if ((zc->zc_cookie & 1) != 0) {
 		/*
 		 * This is recursive rename, so the starting snapshot might
 		 * not exist. Check file system or volume permission instead.
 		 */
 		at = strchr(zc->zc_name, '@');
 		if (at == NULL)
 			return (SET_ERROR(EINVAL));
 		/* Temporarily truncate zc_name at the '@'. */
 		*at = '\0';
 	}
 
 	error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
 
 	/* Undo the temporary truncation, if any. */
 	if (at != NULL)
 		*at = '@';
 
 	return (error);
 }
 
 /*
  * Promote policy: needs promote permission on zc_name (checked by name),
  * then mount permission on the held clone and promote permission on its
  * origin dataset.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char origin_name[MAXNAMELEN];
 	dsl_pool_t *dp;
 	dsl_dataset_t *clone;
 	dsl_dataset_t *origin = NULL;
 	dsl_dir_t *dd;
 	int error;
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_PROMOTE, cr);
 	if (error != 0)
 		return (error);
 
 	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	/* Hold the dataset recorded as the origin of the clone's dir. */
 	dd = clone->ds_dir;
 	error = dsl_dataset_hold_obj(dd->dd_pool,
 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
 	if (error != 0) {
 		dsl_dataset_rele(clone, FTAG);
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
 	    ZFS_DELEG_PERM_MOUNT, cr);
 
 	dsl_dataset_name(origin, origin_name);
 	if (error == 0) {
 		error = zfs_secpolicy_write_perms_ds(origin_name, origin,
 		    ZFS_DELEG_PERM_PROMOTE, cr);
 	}
 
 	dsl_dataset_rele(clone, FTAG);
 	dsl_dataset_rele(origin, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Receive policy: zc_name must grant receive, mount and create, checked
  * in that order; the first failure is returned.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_RECEIVE, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_MOUNT, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_CREATE, cr);
 	return (error);
 }
 
 /*
  * Snapshotting 'name' requires ZFS_DELEG_PERM_SNAPSHOT on it.
  */
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
 	int error;
 
 	error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr);
 	return (error);
 }
 
 /*
  * Check for permission to create each snapshot in the nvlist.
  *
  * innvl must carry a "snaps" nvlist whose pair names are of the form
  * <dataset>@<snapname>; the part before the '@' is what gets checked.
  * Returns EINVAL when "snaps" is missing or a name has no '@', the first
  * permission error encountered otherwise, and 0 when every entry passes
  * (including when the list is empty).
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvlist_t *snaps;
 	/* Initialized so that an empty "snaps" list returns 0, not garbage. */
 	int error = 0;
 	nvpair_t *pair;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
 		return (SET_ERROR(EINVAL));
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		char *name = nvpair_name(pair);
 		char *atp = strchr(name, '@');
 
 		if (atp == NULL) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		/* Temporarily cut the name at '@' for the check. */
 		*atp = '\0';
 		error = zfs_secpolicy_snapshot_perms(name, cr);
 		*atp = '@';
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Check for permission to create each bookmark in the nvlist.
  *
  * Each pair name in innvl must contain a '#'; the part before it is
  * checked for ZFS_DELEG_PERM_BOOKMARK.  A name without '#' yields EINVAL.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvpair_t *elem;
 	int error = 0;
 
 	elem = nvlist_next_nvpair(innvl, NULL);
 	while (elem != NULL) {
 		char *fullname = nvpair_name(elem);
 		char *sep = strchr(fullname, '#');
 
 		if (sep == NULL) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* Temporarily cut the name at '#' for the check. */
 		*sep = '\0';
 		error = zfs_secpolicy_write_perms(fullname,
 		    ZFS_DELEG_PERM_BOOKMARK, cr);
 		*sep = '#';
 
 		if (error != 0)
 			break;
 
 		elem = nvlist_next_nvpair(innvl, elem);
 	}
 	return (error);
 }
 
 /*
  * Check for permission to destroy each bookmark named in innvl: the part
  * of each pair name before '#' must grant ZFS_DELEG_PERM_DESTROY.  A name
  * without '#' yields EINVAL; entries whose check returns ENOENT are
  * dropped from innvl.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	nvpair_t *cur, *next;
 	int error = 0;
 
 	cur = nvlist_next_nvpair(innvl, NULL);
 	while (cur != NULL) {
 		char *fullname = nvpair_name(cur);
 		char *sep = strchr(fullname, '#');
 
 		/* Fetch the successor first; 'cur' may be removed below. */
 		next = nvlist_next_nvpair(innvl, cur);
 
 		if (sep == NULL) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* Temporarily cut the name at '#' for the check. */
 		*sep = '\0';
 		error = zfs_secpolicy_write_perms(fullname,
 		    ZFS_DELEG_PERM_DESTROY, cr);
 		*sep = '#';
 
 		if (error == ENOENT) {
 			/*
 			 * Ignore any filesystems that don't exist (we consider
 			 * their bookmarks "already destroyed").  Remove
 			 * the name from the nvl here in case the filesystem
 			 * is created between now and when we try to destroy
 			 * the bookmark (in which case we don't want to
 			 * destroy it since we haven't checked for permission).
 			 */
 			fnvlist_remove_nvpair(innvl, cur);
 			error = 0;
 		} else if (error != 0) {
 			break;
 		}
 
 		cur = next;
 	}
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	/*
 	 * Even root must have a proper TSD so that we know what pool
 	 * to log to.
 	 */
 	if (tsd_get(zfs_allow_log_key) != NULL)
 		return (0);
 
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Policy for creating zc_name: create and mount permission are required
  * on its parent, and, when innvl carries an "origin" string, clone
  * permission on that origin.
  */
 static int
 zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char	parent[MAXNAMELEN];
 	char	*origin;
 	int	error;
 
 	error = zfs_get_parent(zc->zc_name, parent, sizeof (parent));
 	if (error != 0)
 		return (error);
 
 	/* The origin check applies only when "origin" is supplied. */
 	if (nvlist_lookup_string(innvl, "origin", &origin) == 0) {
 		error = zfs_secpolicy_write_perms(origin,
 		    ZFS_DELEG_PERM_CLONE, cr);
 		if (error != 0)
 			return (error);
 	}
 
 	error = zfs_secpolicy_write_perms(parent, ZFS_DELEG_PERM_CREATE, cr);
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_write_perms(parent, ZFS_DELEG_PERM_MOUNT, cr));
 }
 
 /*
  * Policy for pool operations - create/destroy pools, add vdevs, etc.  Requires
  * SYS_CONFIG privilege, which is not available in a local zone.
  *
  * Any failure of secpolicy_sys_config() is reported as EPERM.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int denied = secpolicy_sys_config(cr, B_FALSE);
 
 	if (denied == 0)
 		return (0);
 
 	return (SET_ERROR(EPERM));
 }
 
 /*
  * Policy for object to name lookups.
  *
  * Allowed outright when secpolicy_sys_config() succeeds; otherwise the
  * delegated diff permission on zc_name decides.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (secpolicy_sys_config(cr, B_FALSE) == 0)
 		return (0);
 
 	return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF,
 	    cr));
 }
 
 /*
  * Policy for fault injection.  Requires all privileges.
  *
  * The decision is delegated entirely to secpolicy_zinject().
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	error = secpolicy_zinject(cr);
 	return (error);
 }
 
 /*
  * Policy for inheriting the property named in zc_value on zc_name.
  * Known properties go through zfs_secpolicy_setprop() (with no value);
  * unknown names must be user properties and need the userprop permission.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
 
 	if (prop != ZPROP_INVAL)
 		return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr));
 
 	/* Not a known property: only user properties are acceptable. */
 	if (!zfs_prop_user(zc->zc_value))
 		return (SET_ERROR(EINVAL));
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_USERPROP, cr));
 }
 
 /*
  * Policy for querying a single user/group space entry.  The read policy
  * must pass and zc_objset_type must index userquota_perms[].  With an
  * empty zc_value the caller may always ask about its own uid (for the
  * userused/userquota types) or a group it is a member of (other types);
  * everything else needs the matching delegated permission.
  */
 static int
 zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int err;
 
 	err = zfs_secpolicy_read(zc, innvl, cr);
 	if (err != 0)
 		return (err);
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
 		return (SET_ERROR(EINVAL));
 
 	if (zc->zc_value[0] == 0) {
 		int is_self;
 
 		/*
 		 * They are asking about a posix uid/gid.  If it's
 		 * themself, allow it.
 		 */
 		if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
 		    zc->zc_objset_type == ZFS_PROP_USERQUOTA)
 			is_self = (zc->zc_guid == crgetuid(cr));
 		else
 			is_self = (groupmember(zc->zc_guid, cr) != 0);
 
 		if (is_self)
 			return (0);
 	}
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    userquota_perms[zc->zc_objset_type], cr));
 }
 
 /*
  * Policy for listing user/group space entries: the read policy must pass,
  * zc_objset_type must index userquota_perms[], and the permission found
  * there must be granted on zc_name.
  */
 static int
 zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int err;
 
 	err = zfs_secpolicy_read(zc, innvl, cr);
 	if (err != 0)
 		return (err);
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
 		return (SET_ERROR(EINVAL));
 
 	err = zfs_secpolicy_write_perms(zc->zc_name,
 	    userquota_perms[zc->zc_objset_type], cr);
 	return (err);
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	const char *dataset = zc->zc_name;
 
 	/* Same policy as setting the "version" property, with no value. */
 	return (zfs_secpolicy_setprop(dataset, ZFS_PROP_VERSION, NULL, cr));
 }
 
 /*
  * Hold policy: for every entry of the "holds" nvlist in innvl, the name
  * produced by dmu_fsname() must grant ZFS_DELEG_PERM_HOLD.  EINVAL is
  * returned when "holds" is missing.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char fsname[MAXNAMELEN];
 	nvlist_t *holds;
 	nvpair_t *pair = NULL;
 	int error;
 
 	if (nvlist_lookup_nvlist(innvl, "holds", &holds) != 0)
 		return (SET_ERROR(EINVAL));
 
 	while ((pair = nvlist_next_nvpair(holds, pair)) != NULL) {
 		error = dmu_fsname(nvpair_name(pair), fsname);
 		if (error == 0) {
 			error = zfs_secpolicy_write_perms(fsname,
 			    ZFS_DELEG_PERM_HOLD, cr);
 		}
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Release policy: for every entry of innvl, the name produced by
  * dmu_fsname() must grant ZFS_DELEG_PERM_RELEASE.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	char fsname[MAXNAMELEN];
 	nvpair_t *pair = NULL;
 	int error;
 
 	while ((pair = nvlist_next_nvpair(innvl, pair)) != NULL) {
 		error = dmu_fsname(nvpair_name(pair), fsname);
 		if (error == 0) {
 			error = zfs_secpolicy_write_perms(fsname,
 			    ZFS_DELEG_PERM_RELEASE, cr);
 		}
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Policy for allowing temporary snapshots to be taken or released
  */
 static int
 zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	/*
 	 * A temporary snapshot is the same as a snapshot,
 	 * hold, destroy and release all rolled into one.
 	 * Delegated diff alone is sufficient that we allow this.
 	 */
 	if (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_DIFF, cr) == 0)
 		return (0);
 
 	/* Without diff, each of the component policies must pass. */
 	error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_hold(zc, innvl, cr);
 	if (error != 0)
 		return (error);
 
 	error = zfs_secpolicy_release(zc, innvl, cr);
 	if (error != 0)
 		return (error);
 
 	return (zfs_secpolicy_destroy(zc, innvl, cr));
 }
 
 /*
  * Returns the nvlist as specified by the user in the zfs_cmd_t.
  *
  * 'nvl' is the source address and 'size' its length in bytes; 'iflag' is
  * passed through to ddi_copyin().  On success *nvp receives the unpacked
  * list and 0 is returned.  Errors: EINVAL for size 0, EFAULT when the
  * copy-in fails, or whatever nvlist_unpack() reports; *nvp is left
  * untouched in those cases.
  */
 static int
 get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
 {
 	nvlist_t *unpacked = NULL;
 	char *buf;
 	int error;
 
 	if (size == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Read in and unpack the user-supplied nvlist.
 	 */
 	buf = kmem_alloc(size, KM_SLEEP);
 
 	if (ddi_copyin((void *)(uintptr_t)nvl, buf, size, iflag) != 0)
 		error = SET_ERROR(EFAULT);
 	else
 		error = nvlist_unpack(buf, size, &unpacked, 0);
 
 	/* The packed copy is no longer needed, whatever the outcome. */
 	kmem_free(buf, size);
 
 	if (error != 0)
 		return (error);
 
 	*nvp = unpacked;
 	return (0);
 }
 
 /*
  * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
  * Entries will be removed from the end of the nvlist, and one int32 entry
  * named "N_MORE_ERRORS" will be added indicating how many entries were
  * removed.
  *
  * Returns 0 when the list fits (possibly after trimming), or ENOMEM when
  * it does not fit and 'max' is below 1024, in which case the list is left
  * unmodified.
  */
 static int
 nvlist_smush(nvlist_t *errors, size_t max)
 {
 	size_t size;
 
 	size = fnvlist_size(errors);
 
 	if (size > max) {
 		nvpair_t *more_errors;
 		int n = 0;
 
 		if (max < 1024)
 			return (SET_ERROR(ENOMEM));
 
 		/*
 		 * Add a placeholder counter up front so that its size is
 		 * accounted for while trimming.  nvlist_prev_nvpair(.., NULL)
 		 * presumably yields the last pair, i.e. the placeholder just
 		 * added -- TODO confirm.
 		 */
 		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
 		more_errors = nvlist_prev_nvpair(errors, NULL);
 
 		/* Drop the pair preceding the placeholder until it fits. */
 		do {
 			nvpair_t *pair = nvlist_prev_nvpair(errors,
 			    more_errors);
 			fnvlist_remove_nvpair(errors, pair);
 			n++;
 			size = fnvlist_size(errors);
 		} while (size > max);
 
 		/* Replace the placeholder with the real removal count. */
 		fnvlist_remove_nvpair(errors, more_errors);
 		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
 		ASSERT3U(fnvlist_size(errors), <=, max);
 	}
 
 	return (0);
 }
 
 /*
  * Pack 'nvl' and copy it out to the destination described by
  * zc_nvlist_dst / zc_nvlist_dst_size.  zc_nvlist_dst_size is always
  * updated with the size of the list and zc_nvlist_dst_filled is set.
  * Returns EFAULT when the copy-out fails, otherwise 0 -- including when
  * the destination is too small and nothing was copied.
  */
 static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	size_t size = fnvlist_size(nvl);
 	int error = 0;
 
 	if (size <= zc->zc_nvlist_dst_size) {
 		char *packed = fnvlist_pack(nvl, &size);
 
 		if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    size, zc->zc_iflags) != 0)
 			error = SET_ERROR(EFAULT);
 		fnvlist_pack_free(packed, size);
 	}
 	/*
 	 * Otherwise the list does not fit.  Solaris returns ENOMEM here,
 	 * because even if an error is returned from an ioctl(2), new
 	 * zc_nvlist_dst_size will be passed to the userland. This is not
 	 * the case for FreeBSD.  We need to return 0, so the kernel will
 	 * copy the zc_nvlist_dst_size back and the userland can discover
 	 * that a bigger buffer is needed.
 	 */
 
 	zc->zc_nvlist_dst_size = size;
 	zc->zc_nvlist_dst_filled = B_TRUE;
 	return (error);
 }
 
 /*
  * Look up the zfsvfs_t registered as the user pointer of objset 'dsname'.
  * On success *zfvp is set and VFS_HOLD() has been applied to its z_vfs.
  * Returns EINVAL when the objset is not of type DMU_OST_ZFS, ESRCH when
  * no user pointer is set, or the dmu_objset_hold() error.
  */
 static int
 getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_hold(dsname, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		/* Read the user pointer under its lock. */
 		mutex_enter(&os->os_user_ptr_lock);
 		*zfvp = dmu_objset_get_user(os);
 		if (*zfvp == NULL) {
 			error = SET_ERROR(ESRCH);
 		} else {
 			VFS_HOLD((*zfvp)->z_vfs);
 		}
 		mutex_exit(&os->os_user_ptr_lock);
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
  * case its z_vfs will be NULL, and it will be opened as the owner.
  * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
  * which prevents all vnode ops from running.
  *
  * 'tag' is the tag passed to rrm_enter()/rrm_exit() on z_teardown_lock.
  * Returns 0 with the lock held, the zfsvfs_create() error, or EBUSY when
  * the zfsvfs is found to be unmounted.
  */
 static int
 zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
 {
 	int error = 0;
 
 	/* Any getzfsvfs() failure falls back to creating a private one. */
 	if (getzfsvfs(name, zfvp) != 0)
 		error = zfsvfs_create(name, zfvp);
 	if (error == 0) {
 		rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
 		    RW_READER, tag);
 		if ((*zfvp)->z_unmounted) {
 			/*
 			 * XXX we could probably try again, since the unmounting
 			 * thread should be just about to disassociate the
 			 * objset from the zfsvfs.
 			 */
 			/*
 			 * NOTE(review): only the teardown lock is dropped on
 			 * this path; whatever getzfsvfs()/zfsvfs_create()
 			 * acquired is not released here -- confirm intended.
 			 */
 			rrm_exit(&(*zfvp)->z_teardown_lock, tag);
 			return (SET_ERROR(EBUSY));
 		}
 	}
 	return (error);
 }
 
 /*
  * Drop z_teardown_lock (entered with 'tag'), then either release the VFS
  * hold (z_vfs set) or disown the objset and free the zfsvfs (z_vfs NULL).
  */
 static void
 zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
 {
 	rrm_exit(&zfsvfs->z_teardown_lock, tag);
 
 	if (zfsvfs->z_vfs == NULL) {
 		/* No VFS attached: disown the objset and free the zfsvfs. */
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
 		VFS_RELE(zfsvfs->z_vfs);
 	}
 }
 
 /*
  * Create the pool named zc_name.
  *
  * inputs:
  * zc_name		name of the pool
  * zc_nvlist_conf	config nvlist (required)
  * zc_nvlist_src	property nvlist (optional; used when
  *			zc_nvlist_src_size is non-zero)
  *
  * Returns 0 on success or an errno value; all nvlists obtained here are
  * freed before returning.
  */
 static int
 zfs_ioc_pool_create(zfs_cmd_t *zc)
 {
 	int error;
 	nvlist_t *config, *props = NULL;
 	nvlist_t *rootprops = NULL;
 	nvlist_t *zplprops = NULL;
 
 	if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config))
 		return (error);
 
 	if (zc->zc_nvlist_src_size != 0 && (error =
 	    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &props))) {
 		nvlist_free(config);
 		return (error);
 	}
 
 	if (props) {
 		nvlist_t *nvl = NULL;
 		uint64_t version = SPA_VERSION;
 
 		/* 'version' keeps SPA_VERSION when the property is absent. */
 		(void) nvlist_lookup_uint64(props,
 		    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
 		if (!SPA_VERSION_IS_SUPPORTED(version)) {
 			error = SET_ERROR(EINVAL);
 			goto pool_props_bad;
 		}
 		/*
 		 * Move any ZPOOL_ROOTFS_PROPS sub-list out of 'props' into
 		 * a private copy ('rootprops').
 		 */
 		(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
 		if (nvl) {
 			error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
 			if (error != 0) {
 				nvlist_free(config);
 				nvlist_free(props);
 				return (error);
 			}
 			(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
 		}
 		VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		error = zfs_fill_zplprops_root(version, rootprops,
 		    zplprops, NULL);
 		if (error != 0)
 			goto pool_props_bad;
 	}
 
 	error = spa_create(zc->zc_name, config, props, zplprops);
 
 	/*
 	 * Set the remaining root properties
 	 */
 	/* If that fails, the pool that was just created is destroyed. */
 	if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
 	    ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
 		(void) spa_destroy(zc->zc_name);
 
 pool_props_bad:
 	/* Common cleanup; some of these may still be NULL here. */
 	nvlist_free(rootprops);
 	nvlist_free(zplprops);
 	nvlist_free(config);
 	nvlist_free(props);
 
 	return (error);
 }
 
 /*
  * Destroy the pool named zc_name.  The request is logged first; the zvol
  * minors for the pool are removed only when spa_destroy() succeeds.
  */
 static int
 zfs_ioc_pool_destroy(zfs_cmd_t *zc)
 {
 	int error;
 
 	zfs_log_history(zc);
 
 	error = spa_destroy(zc->zc_name);
 	if (error != 0)
 		return (error);
 
 	zvol_remove_minors(zc->zc_name);
 	return (0);
 }
 
 /*
  * Import the pool described by zc_nvlist_conf under the name zc_name.
  *
  * The pool guid stored in the config must equal zc_guid, else EINVAL.
  * Optional properties come from zc_nvlist_src and zc_cookie is passed to
  * spa_import() as its last argument.  When zc_nvlist_dst is set, the
  * config is copied back out; a failure of that copy-out overrides the
  * import result.
  */
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	nvlist_t *props = NULL;
 	uint64_t guid;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &props);
 		if (error != 0) {
 			nvlist_free(config);
 			return (error);
 		}
 	}
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
 	    guid == zc->zc_guid)
 		error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
 	else
 		error = SET_ERROR(EINVAL);
 
 	if (zc->zc_nvlist_dst != 0) {
 		int put_err = put_nvlist(zc, config);
 
 		if (put_err != 0)
 			error = put_err;
 	}
 
 	nvlist_free(config);
 
 	if (props != NULL)
 		nvlist_free(props);
 
 	return (error);
 }
 
 /*
  * Export the pool named zc_name.  zc_cookie carries the 'force' flag and
  * zc_guid the 'hardforce' flag.  The request is logged first; zvol minors
  * are removed only when spa_export() succeeds.
  */
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
 	boolean_t force = (boolean_t)zc->zc_cookie;
 	boolean_t hardforce = (boolean_t)zc->zc_guid;
 	int error;
 
 	zfs_log_history(zc);
 
 	error = spa_export(zc->zc_name, NULL, force, hardforce);
 	if (error != 0)
 		return (error);
 
 	zvol_remove_minors(zc->zc_name);
 	return (0);
 }
 
 /*
  * Copy out the nvlist returned by spa_all_configs(), which is given the
  * address of zc_cookie.  Returns EEXIST when that call yields NULL.
  */
 static int
 zfs_ioc_pool_configs(zfs_cmd_t *zc)
 {
 	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
 	int error;
 
 	if (configs == NULL)
 		return (SET_ERROR(EEXIST));
 
 	error = put_nvlist(zc, configs);
 	nvlist_free(configs);
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of the pool
  *
  * outputs:
  * zc_cookie		real errno
  * zc_nvlist_dst	config nvlist
  * zc_nvlist_dst_size	size of config nvlist
  */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	int stats_err;
 	int ret;
 
 	stats_err = spa_get_stats(zc->zc_name, &config, zc->zc_value,
 	    sizeof (zc->zc_value));
 
 	/* Without a config there is nothing to copy out. */
 	if (config == NULL)
 		return (stats_err);
 
 	ret = put_nvlist(zc, config);
 	nvlist_free(config);
 
 	/*
 	 * The config may be present even if spa_get_stats() failed.
 	 * In this case we return the copy-out result, and preserve the
 	 * real errno in 'zc_cookie'.
 	 */
 	zc->zc_cookie = stats_err;
 
 	return (ret);
 }
 
 /*
  * Try to import the given pool, returning pool stats as appropriate so that
  * user land knows which devices are available and overall pool health.
  *
  * The candidate config is read from zc_nvlist_conf; EINVAL is returned
  * when spa_tryimport() produces no config.
  */
 static int
 zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 {
 	nvlist_t *candidate;
 	nvlist_t *result;
 	int error;
 
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &candidate);
 	if (error != 0)
 		return (error);
 
 	result = spa_tryimport(candidate);
 	nvlist_free(candidate);
 
 	if (result == NULL)
 		return (SET_ERROR(EINVAL));
 
 	error = put_nvlist(zc, result);
 	nvlist_free(result);
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name              name of the pool
  * zc_cookie            scan func (pool_scan_func_t)
  *
  * POOL_SCAN_NONE stops the scan; any other value is handed to spa_scan().
  */
 static int
 zfs_ioc_pool_scan(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_cookie != POOL_SCAN_NONE)
 		error = spa_scan(spa, zc->zc_cookie);
 	else
 		error = spa_scan_stop(spa);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Open the pool named zc_name and call spa_freeze() on it.  Returns the
  * spa_open() result.
  */
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	spa_freeze(spa);
 	spa_close(spa, FTAG);
 	return (0);
 }
 
 /*
  * Upgrade the pool named zc_name to the version in zc_cookie.  EINVAL is
  * returned when that version is lower than the pool's current version or
  * is not a supported version.
  */
 static int
 zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (zc->zc_cookie >= spa_version(spa) &&
 	    SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
 		spa_upgrade(spa, zc->zc_cookie);
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Read pool history for zc_name into the destination at zc_history.
  *
  * zc_history_len is the destination size on entry (EINVAL when 0) and,
  * together with zc_history_offset, is passed by address to
  * spa_history_get().  ENOTSUP is returned for pools older than
  * SPA_VERSION_ZPOOL_HISTORY.
  */
 static int
 zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	char *hist_buf;
 	uint64_t size = zc->zc_history_len;
 	int error;
 
 	if (size == 0)
 		return (SET_ERROR(EINVAL));
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	/*
 	 * The buffer is sized from the length captured on entry, since
 	 * zc_history_len is handed to spa_history_get() by address.
 	 */
 	hist_buf = kmem_alloc(size, KM_SLEEP);
 
 	error = spa_history_get(spa, &zc->zc_history_offset,
 	    &zc->zc_history_len, hist_buf);
 	if (error == 0) {
 		error = ddi_copyout(hist_buf,
 		    (void *)(uintptr_t)zc->zc_history,
 		    zc->zc_history_len, zc->zc_iflags);
 	}
 
 	spa_close(spa, FTAG);
 	kmem_free(hist_buf, size);
 	return (error);
 }
 
 /*
  * Open the pool named zc_name and return the result of
  * spa_change_guid() on it.
  */
 static int
 zfs_ioc_pool_reguid(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_change_guid(spa);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Thin wrapper: passes zc_name, zc_obj and zc_value straight to
  * dsl_dsobj_to_dsname() and returns its result.
  */
 static int
 zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_obj		object to find
  *
  * outputs:
  * zc_value		name of object
  *
  * Returns EINVAL when the objset is not of type DMU_OST_ZFS.
  */
 static int
 zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	/* XXX reading from objset not owned */
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
 		    sizeof (zc->zc_value));
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_obj		object to find
  *
  * outputs:
  * zc_stat		stats on object
  * zc_value		path to object
  *
  * Returns EINVAL when the objset is not of type DMU_OST_ZFS.
  */
 static int
 zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	/* XXX reading from objset not owned */
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0)
 		return (error);
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat,
 		    zc->zc_value, sizeof (zc->zc_value));
 	} else {
 		error = SET_ERROR(EINVAL);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
 /*
  * Add the vdevs described by zc_nvlist_conf to the pool named zc_name.
  *
  * Returns the spa_open() or get_nvlist() error if either fails, EDOM on
  * illumos for the root-pool restriction described below, and otherwise
  * the result of spa_vdev_add().
  */
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 	nvlist_t *config, **l2cache, **spares;
 	uint_t nl2cache = 0, nspares = 0;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Bail out before touching 'config' if it could not be read:
 	 * get_nvlist() leaves it unset on failure, so the lookups (and the
 	 * nvlist_free() in the illumos path) below must not see it.
 	 */
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error != 0) {
 		spa_close(spa, FTAG);
 		return (error);
 	}
 
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache);
 
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares);
 
 #ifdef illumos
 	/*
 	 * A root pool with concatenated devices is not supported.
 	 * Thus, can not add a device to a root pool.
 	 *
 	 * Intent log device can not be added to a rootpool because
 	 * during mountroot, zil is replayed, a separated log device
 	 * can not be accessed during the mountroot time.
 	 *
 	 * l2cache and spare devices are ok to be added to a rootpool.
 	 */
 	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
 		nvlist_free(config);
 		spa_close(spa, FTAG);
 		return (SET_ERROR(EDOM));
 	}
 #endif /* illumos */
 
 	error = spa_vdev_add(spa, config);
 	nvlist_free(config);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of the pool
  * zc_nvlist_conf	nvlist of devices to remove
  * zc_cookie		to stop the remove?
  *
  * NOTE(review): the body below identifies the vdev by zc_guid and reads
  * neither zc_nvlist_conf nor zc_cookie -- the list above may be stale.
  */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
 		spa_close(spa, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Change the state of vdev zc_guid in pool zc_name to the state requested
  * in zc_cookie (online, offline, faulted or degraded; anything else is
  * EINVAL).  zc_obj is passed through as the third argument to the vdev_*()
  * call; for faulted/degraded it is first forced to VDEV_AUX_ERR_EXCEEDED
  * unless it is already that or VDEV_AUX_EXTERNAL.  On return zc_cookie
  * holds 'newstate', which only vdev_online() is given a chance to update.
  */
 static int
 zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	switch (zc->zc_cookie) {
 	case VDEV_STATE_ONLINE:
 		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
 		break;
 
 	case VDEV_STATE_OFFLINE:
 		error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
 		break;
 
 	case VDEV_STATE_FAULTED:
 	case VDEV_STATE_DEGRADED:
 		/* Normalize the aux reason before applying the state. */
 		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
 		    zc->zc_obj != VDEV_AUX_EXTERNAL)
 			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
 
 		if (zc->zc_cookie == VDEV_STATE_FAULTED)
 			error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
 		else
 			error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
 		break;
 
 	default:
 		error = SET_ERROR(EINVAL);
 	}
 
 	zc->zc_cookie = newstate;
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Attach the device described by the zc_nvlist_conf nvlist to the vdev
  * zc_guid in pool zc_name.  zc_cookie is forwarded to spa_vdev_attach()
  * as its "replacing" argument.
  */
 static int
 zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 {
 	int replacing = zc->zc_cookie;
 	nvlist_t *newdev;
 	spa_t *pool;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &pool, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &newdev);
 	if (rc != 0) {
 		spa_close(pool, FTAG);
 		return (rc);
 	}
 
 	rc = spa_vdev_attach(pool, zc->zc_guid, newdev, replacing);
 	nvlist_free(newdev);
 	spa_close(pool, FTAG);
 	return (rc);
 }
 
 /*
  * Detach the vdev zc_guid from pool zc_name via spa_vdev_detach().
  */
 static int
 zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 {
 	spa_t *pool;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &pool, FTAG);
 	if (rc == 0) {
 		rc = spa_vdev_detach(pool, zc->zc_guid, 0, B_FALSE);
 		spa_close(pool, FTAG);
 	}
 	return (rc);
 }
 
 /*
  * Split pool zc_name into a new pool named by zc_string.
  *
  * zc_nvlist_conf supplies the configuration nvlist; zc_nvlist_src, when
  * zc_nvlist_src_size is non-zero, supplies a property nvlist.  The
  * ZPOOL_EXPORT_AFTER_SPLIT bit of zc_cookie is forwarded to
  * spa_vdev_split_mirror() as its final argument.
  */
 static int
 zfs_ioc_vdev_split(zfs_cmd_t *zc)
 {
 	boolean_t exp = (zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT) != 0;
 	nvlist_t *props = NULL;
 	nvlist_t *config;
 	spa_t *pool;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &pool, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (rc == 0) {
 		/* The property list is optional. */
 		if (zc->zc_nvlist_src_size != 0) {
 			rc = get_nvlist(zc->zc_nvlist_src,
 			    zc->zc_nvlist_src_size, zc->zc_iflags, &props);
 		}
 
 		if (rc == 0) {
 			rc = spa_vdev_split_mirror(pool, zc->zc_string,
 			    config, props, exp);
 			nvlist_free(props);
 		}
 		nvlist_free(config);
 	}
 
 	spa_close(pool, FTAG);
 	return (rc);
 }
 
 /*
  * Hand zc_value to spa_vdev_setpath() as the new path of the vdev
  * zc_guid in pool zc_name.
  */
 static int
 zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 {
 	spa_t *pool;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &pool, FTAG);
 	if (rc == 0) {
 		rc = spa_vdev_setpath(pool, zc->zc_guid, zc->zc_value);
 		spa_close(pool, FTAG);
 	}
 	return (rc);
 }
 
 /*
  * Hand zc_value to spa_vdev_setfru() as the new FRU of the vdev
  * zc_guid in pool zc_name.
  */
 static int
 zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
 {
 	spa_t *pool;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &pool, FTAG);
 	if (rc == 0) {
 		rc = spa_vdev_setfru(pool, zc->zc_guid, zc->zc_value);
 		spa_close(pool, FTAG);
 	}
 	return (rc);
 }
 
 /*
  * Fill in zc_objset_stats for 'os'.  When the caller supplied a
  * destination buffer (zc_nvlist_dst != 0), also collect the objset's
  * properties and statistics into an nvlist and pass it to put_nvlist().
  *
  * Returns 0 on success, otherwise the first error encountered.
  */
 static int
 zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 {
 	int error = 0;
 	nvlist_t *nv;
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	if (zc->zc_nvlist_dst != 0 &&
 	    (error = dsl_prop_get_all(os, &nv)) == 0) {
 		dmu_objset_stats(os, nv);
 		/*
 		 * NB: zvol_get_stats() will read the objset contents,
 		 * which we aren't supposed to do with a
 		 * DS_MODE_USER hold, because it could be
 		 * inconsistent.  So this is a bit of a workaround...
 		 * XXX reading with out owning
 		 */
 		if (!zc->zc_objset_stats.dds_inconsistent &&
 		    dmu_objset_type(os) == DMU_OST_ZVOL) {
 			error = zvol_get_stats(os, nv);
 			if (error == EIO) {
 				/* Release the property list before bailing. */
 				nvlist_free(nv);
 				return (error);
 			}
 			VERIFY0(error);
 		}
 		error = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	}
 
 	return (error);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * An ENOMEM result, whether from the hold or from gathering the stats,
  * is reported to the caller as success.
  */
 static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int rc;
 
 	rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (rc == 0) {
 		rc = zfs_ioc_objset_stats_impl(zc, os);
 		dmu_objset_rele(os, FTAG);
 	}
 
 	return (rc == ENOMEM ? 0 : rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_nvlist_dst	received property nvlist
  * zc_nvlist_dst_size	size of received property nvlist
  *
  * Gets received properties (distinct from local properties on or after
  * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
  * local property values.
  */
 static int
 zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
 {
 	nvlist_t *recvd;
 	int rc;
 
 	/*
 	 * Refuse datasets that have never received properties on or after
 	 * SPA_VERSION_RECVD_PROPS; otherwise local values would be returned.
 	 */
 	if (!dsl_prop_get_hasrecvd(zc->zc_name))
 		return (SET_ERROR(ENOTSUP));
 
 	/* Nothing to copy out when no destination buffer was supplied. */
 	if (zc->zc_nvlist_dst == 0)
 		return (0);
 
 	rc = dsl_prop_get_received(zc->zc_name, &recvd);
 	if (rc != 0)
 		return (rc);
 
 	rc = put_nvlist(zc, recvd);
 	nvlist_free(recvd);
 	return (rc);
 }
 
 /*
  * Look up ZPL property 'prop' on 'os' and store it in 'props' as a
  * uint64 keyed by the property's name.  Returns the zfs_get_zplprop()
  * error on lookup failure, 0 otherwise.
  */
 static int
 nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
 {
 	uint64_t value;
 	int rc;
 
 	/*
 	 * zfs_get_zplprop() yields either the stored value or the
 	 * property's default (if there is one).
 	 */
 	rc = zfs_get_zplprop(os, prop, &value);
 	if (rc == 0) {
 		VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
 	}
 	return (rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_dst_size	size of buffer for zpl property nvlist
  *
  * outputs:
  * zc_nvlist_dst	zpl property nvlist
  * zc_nvlist_dst_size	size of zpl property nvlist
  *
  * Returns ENOENT when no destination buffer was supplied, the dataset is
  * flagged inconsistent, or the objset is not of type DMU_OST_ZFS.
  */
 static int
 zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 {
 	static const zfs_prop_t wanted[] = {
 		ZFS_PROP_VERSION,
 		ZFS_PROP_NORMALIZE,
 		ZFS_PROP_UTF8ONLY,
 		ZFS_PROP_CASE
 	};
 	objset_t *os;
 	nvlist_t *nv;
 	size_t i;
 	int err;
 
 	/* XXX reading without owning */
 	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (err != 0)
 		return (err);
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	/*
 	 * NB: nvl_add_zplprop() will read the objset contents,
 	 * which we aren't supposed to do with a DS_MODE_USER
 	 * hold, because it could be inconsistent.
 	 */
 	if (zc->zc_nvlist_dst == 0 ||
 	    zc->zc_objset_stats.dds_inconsistent ||
 	    dmu_objset_type(os) != DMU_OST_ZFS) {
 		err = SET_ERROR(ENOENT);
 	} else {
 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		/* Stop at the first property that cannot be fetched. */
 		for (i = 0; err == 0 &&
 		    i < sizeof (wanted) / sizeof (wanted[0]); i++)
 			err = nvl_add_zplprop(os, nv, wanted[i]);
 
 		if (err == 0)
 			err = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	}
 
 	dmu_objset_rele(os, FTAG);
 	return (err);
 }
 
 /*
  * Report whether a dataset should be skipped when listing: internal
  * datasets (a '$' in the name), temporary datasets (a '%' in the name),
  * and datasets that are not visible in the current zone.
  */
 boolean_t
 dataset_name_hidden(const char *name)
 {
 	boolean_t internal = (strchr(name, '$') != NULL) ? B_TRUE : B_FALSE;
 	boolean_t temporary = (strchr(name, '%') != NULL) ? B_TRUE : B_FALSE;
 
 	if (internal || temporary)
 		return (B_TRUE);
 
 	/* Zone visibility only matters outside the global zone. */
 	if (INGLOBALZONE(curthread))
 		return (B_FALSE);
 
 	return (zone_dataset_visible(name, NULL) ? B_FALSE : B_TRUE);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_nvlist_dst_size	size of buffer for property nvlist
  *
  * outputs:
  * zc_name		name of next filesystem
  * zc_cookie		zap cursor
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * ENOENT from the hold or from the directory walk is reported as ESRCH.
  */
 static int
 zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 {
 	const size_t orig_len = strlen(zc->zc_name);
 	objset_t *os;
 	char *tail;
 	int error;
 
 	for (;;) {
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 		if (error != 0) {
 			if (error == ENOENT)
 				error = SET_ERROR(ESRCH);
 			return (error);
 		}
 
 		/* Make sure the parent name ends in '/' before appending. */
 		tail = strrchr(zc->zc_name, '/');
 		if (tail == NULL || tail[1] != '\0')
 			(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 		tail = zc->zc_name + strlen(zc->zc_name);
 
 		/* Advance the cursor past any hidden children. */
 		do {
 			error = dmu_dir_list_next(os,
 			    sizeof (zc->zc_name) - (tail - zc->zc_name), tail,
 			    NULL, &zc->zc_cookie);
 			if (error == ENOENT)
 				error = SET_ERROR(ESRCH);
 		} while (error == 0 && dataset_name_hidden(zc->zc_name));
 		dmu_objset_rele(os, FTAG);
 
 		/*
 		 * If it's an internal dataset (ie. with a '$' in its name),
 		 * don't try to get stats for it, otherwise we'll return
 		 * ENOENT.
 		 */
 		if (error != 0 || strchr(zc->zc_name, '$') != NULL)
 			return (error);
 
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 		if (error != ENOENT)
 			return (error);
 
 		/* We lost a race with destroy, get the next one. */
 		zc->zc_name[orig_len] = '\0';
 	}
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_nvlist_dst_size	size of buffer for property nvlist
  * zc_simple		when set, only name is requested
  *
  * outputs:
  * zc_name		name of next snapshot
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
  *
  * ENOENT from the hold or from the snapshot walk is reported as ESRCH.
  */
 static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	size_t used;
 	int error;
 
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error == ENOENT)
 		return (ESRCH);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
 	 */
 	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(ESRCH));
 	}
 
 	/* The snapshot component is written right after the '@'. */
 	used = strlen(zc->zc_name);
 	error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - used,
 	    zc->zc_name + used, &zc->zc_obj, &zc->zc_cookie, NULL);
 
 	if (error == ENOENT) {
 		error = SET_ERROR(ESRCH);
 	} else if (error == 0 && !zc->zc_simple) {
 		dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
 		dsl_dataset_t *ds;
 
 		error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
 		if (error == 0) {
 			objset_t *ossnap;
 
 			error = dmu_objset_from_ds(ds, &ossnap);
 			if (error == 0)
 				error = zfs_ioc_objset_stats_impl(zc, ossnap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 	}
 
 	dmu_objset_rele(os, FTAG);
 
 	/* if we failed, undo the @ that we tacked on to zc_name */
 	if (error != 0)
 		*strchr(zc->zc_name, '@') = '\0';
 	return (error);
 }
 
 /*
  * Apply one userquota-style property to dataset 'dsname'.
  *
  * The pair's name must contain a '-'; the text after the first '-' is
  * passed on as the domain.  The value (optionally wrapped in an nvlist
  * under ZPROP_VALUE) must be a uint64 array of exactly three elements:
  * quota type, rid and quota.  Anything else yields EINVAL.
  */
 static int
 zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
 {
 	const char *propname = nvpair_name(pair);
 	zfs_userquota_prop_t qtype;
 	unsigned int nvals;
 	uint64_t *vals;
 	zfsvfs_t *zfsvfs;
 	char *dash;
 	int err;
 
 	/* Unwrap { ZPROP_VALUE -> value } when the pair is an nvlist. */
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A correctly constructed propname is encoded as
 	 * userquota@<rid>-<domain>.
 	 */
 	dash = strchr(propname, '-');
 	if (dash == NULL)
 		return (SET_ERROR(EINVAL));
 	if (nvpair_value_uint64_array(pair, &vals, &nvals) != 0 || nvals != 3)
 		return (SET_ERROR(EINVAL));
 
 	qtype = vals[0];
 
 	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
 	if (err != 0)
 		return (err);
 
 	err = zfs_set_userquota(zfsvfs, qtype, dash + 1, vals[1], vals[2]);
 	zfsvfs_rele(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * If the named property is one that has a special function to set its value,
  * return 0 on success and a positive error code on failure; otherwise if it is
  * not one of the special properties handled by this function, return -1.
  *
  * The filesystem/snapshot limit properties are a hybrid: the limit feature
  * is activated here, but -1 is still returned on success so that the caller
  * stores the value through its default path.
  *
  * XXX: It would be better for callers of the property interface if we handled
  * these special cases in dsl_prop.c (in the dsl layer).
  */
 static int
 zfs_prop_set_special(const char *dsname, zprop_source_t source,
     nvpair_t *pair)
 {
 	const char *propname = nvpair_name(pair);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
 	int err = -1;
 
 	/* Names that are not native properties: only userquota is special. */
 	if (prop == ZPROP_INVAL) {
 		if (zfs_prop_userquota(propname))
 			return (zfs_prop_set_userquota(dsname, pair));
 		return (-1);
 	}
 
 	/* Unwrap { ZPROP_VALUE -> value } when the pair is an nvlist. */
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 		    &pair) == 0);
 	}
 
 	/* None of the special properties handled below are strings. */
 	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
 		return (-1);
 
 	VERIFY(0 == nvpair_value_uint64(pair, &intval));
 
 	switch (prop) {
 	case ZFS_PROP_QUOTA:
 		err = dsl_dir_set_quota(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFQUOTA:
 		err = dsl_dataset_set_refquota(dsname, source, intval);
 		break;
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 		if (intval == UINT64_MAX) {
 			/* clearing the limit, just do it */
 			err = 0;
 		} else {
 			err = dsl_dir_activate_fs_ss_limit(dsname);
 		}
 		/*
 		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
 		 * default path to set the value in the nvlist.
 		 */
 		if (err == 0)
 			err = -1;
 		break;
 	case ZFS_PROP_RESERVATION:
 		err = dsl_dir_set_reservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFRESERVATION:
 		err = dsl_dataset_set_refreservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_VOLSIZE:
 		err = zvol_set_volsize(dsname, intval);
 		break;
 	case ZFS_PROP_VERSION:
 	{
 		zfsvfs_t *zfsvfs;
 
 		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
 			break;
 
 		err = zfs_set_version(zfsvfs, intval);
 		zfsvfs_rele(zfsvfs, FTAG);
 
 		if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
 			zfs_cmd_t *zc;
 
 			/*
 			 * Kick off the userspace accounting upgrade; its
 			 * result is deliberately ignored.  The copy is
 			 * bounded so an over-long dsname cannot overrun
 			 * zc_name.
 			 */
 			zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 			(void) strlcpy(zc->zc_name, dsname,
 			    sizeof (zc->zc_name));
 			(void) zfs_ioc_userspace_upgrade(zc);
 			kmem_free(zc, sizeof (zfs_cmd_t));
 		}
 		break;
 	}
 	default:
 		err = -1;
 	}
 
 	return (err);
 }
 
 /*
  * This function is best effort. If it fails to set any of the given properties,
  * it continues to set as many as it can and returns the last error
  * encountered. If the caller provides a non-NULL errlist, it will be filled in
  * with the list of names of all the properties that failed along with the
  * corresponding error numbers.
  *
  * If every property is set successfully, zero is returned and errlist is not
  * modified.
  *
  * Processing happens in up to three stages:
  *  1. Each pair in nvl is validated and permission-checked.  Special
  *     properties are set immediately; all others are queued in genericnvl.
  *  2. Special properties that failed in stage 1 are queued in retrynvl and
  *     the loop is run once more over that list.
  *  3. genericnvl is applied with a single dsl_props_set() call; if that
  *     fails, each queued property is set individually.
  */
 int
 zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
     nvlist_t *errlist)
 {
 	nvpair_t *pair;
 	nvpair_t *propval;
 	int rv = 0;
 	uint64_t intval;
 	char *strval;
 	nvlist_t *genericnvl = fnvlist_alloc();
 	nvlist_t *retrynvl = fnvlist_alloc();
 
 retry:
 	pair = NULL;
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
 		zfs_prop_t prop = zfs_name_to_prop(propname);
 		int err = 0;
 
 		/* decode the property value */
 		propval = pair;
 		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 			nvlist_t *attrs;
 			attrs = fnvpair_value_nvlist(pair);
 			if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 			    &propval) != 0)
 				err = SET_ERROR(EINVAL);
 		}
 
 		/* Validate value type */
 		if (err == 0 && prop == ZPROP_INVAL) {
 			/*
 			 * Not a native property: only user properties
 			 * (string) and userquota properties (uint64 array)
 			 * are accepted.
 			 */
 			if (zfs_prop_user(propname)) {
 				if (nvpair_type(propval) != DATA_TYPE_STRING)
 					err = SET_ERROR(EINVAL);
 			} else if (zfs_prop_userquota(propname)) {
 				if (nvpair_type(propval) !=
 				    DATA_TYPE_UINT64_ARRAY)
 					err = SET_ERROR(EINVAL);
 			} else {
 				err = SET_ERROR(EINVAL);
 			}
 		} else if (err == 0) {
 			/*
 			 * Native property: the nvpair type must agree with
 			 * the property's declared type.
 			 */
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
 				if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
 					err = SET_ERROR(EINVAL);
 			} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
 				const char *unused;
 
 				intval = fnvpair_value_uint64(propval);
 
 				switch (zfs_prop_get_type(prop)) {
 				case PROP_TYPE_NUMBER:
 					break;
 				case PROP_TYPE_STRING:
 					err = SET_ERROR(EINVAL);
 					break;
 				case PROP_TYPE_INDEX:
 					/* the index must map to a name */
 					if (zfs_prop_index_to_string(prop,
 					    intval, &unused) != 0)
 						err = SET_ERROR(EINVAL);
 					break;
 				default:
 					cmn_err(CE_PANIC,
 					    "unknown property type");
 				}
 			} else {
 				err = SET_ERROR(EINVAL);
 			}
 		}
 
 		/* Validate permissions */
 		if (err == 0)
 			err = zfs_check_settable(dsname, pair, CRED());
 
 		if (err == 0) {
 			err = zfs_prop_set_special(dsname, source, pair);
 			if (err == -1) {
 				/*
 				 * For better performance we build up a list of
 				 * properties to set in a single transaction.
 				 */
 				err = nvlist_add_nvpair(genericnvl, pair);
 			} else if (err != 0 && nvl != retrynvl) {
 				/*
 				 * This may be a spurious error caused by
 				 * receiving quota and reservation out of order.
 				 * Try again in a second pass.
 				 *
 				 * Note that err is overwritten here, so a
 				 * first-pass failure is only reported if
 				 * queueing the retry itself fails.
 				 */
 				err = nvlist_add_nvpair(retrynvl, pair);
 			}
 		}
 
 		if (err != 0) {
 			if (errlist != NULL)
 				fnvlist_add_int32(errlist, propname, err);
 			rv = err;
 		}
 	}
 
 	/*
 	 * Run the loop a second (and last) time over the queued retries;
 	 * once nvl points at retrynvl this condition can no longer hold.
 	 */
 	if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
 		nvl = retrynvl;
 		goto retry;
 	}
 
 	if (!nvlist_empty(genericnvl) &&
 	    dsl_props_set(dsname, source, genericnvl) != 0) {
 		/*
 		 * If this fails, we still want to set as many properties as we
 		 * can, so try setting them individually.
 		 */
 		pair = NULL;
 		while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
 			const char *propname = nvpair_name(pair);
 			int err = 0;
 
 			propval = pair;
 			if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 				nvlist_t *attrs;
 				attrs = fnvpair_value_nvlist(pair);
 				propval = fnvlist_lookup_nvpair(attrs,
 				    ZPROP_VALUE);
 			}
 
 			/* anything that is not a string is set as a uint64 */
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
 				strval = fnvpair_value_string(propval);
 				err = dsl_prop_set_string(dsname, propname,
 				    source, strval);
 			} else {
 				intval = fnvpair_value_uint64(propval);
 				err = dsl_prop_set_int(dsname, propname, source,
 				    intval);
 			}
 
 			if (err != 0) {
 				if (errlist != NULL) {
 					fnvlist_add_int32(errlist, propname,
 					    err);
 				}
 				rv = err;
 			}
 		}
 	}
 	nvlist_free(genericnvl);
 	nvlist_free(retrynvl);
 
 	return (rv);
 }
 
 /*
  * Check that all the properties are valid user properties.
  *
  * Every pair in nvl must have a user-property name and a string value,
  * the caller must hold ZFS_DELEG_PERM_USERPROP on fsname, and both the
  * name and the value must fit within the ZAP limits.
  *
  * Returns 0 if every pair passes, otherwise EINVAL, the permission
  * check's error, ENAMETOOLONG or E2BIG for the first offending pair.
  */
 static int
 zfs_check_userprops(const char *fsname, nvlist_t *nvl)
 {
 	nvpair_t *pair = NULL;
 	int error = 0;
 
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
 
 		if (!zfs_prop_user(propname) ||
 		    nvpair_type(pair) != DATA_TYPE_STRING)
 			return (SET_ERROR(EINVAL));
 
 		error = zfs_secpolicy_write_perms(fsname,
 		    ZFS_DELEG_PERM_USERPROP, CRED());
 		if (error != 0)
 			return (error);
 
 		if (strlen(propname) >= ZAP_MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 
 		/* Use SET_ERROR() here like every other failure path. */
 		if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
 			return (SET_ERROR(E2BIG));
 	}
 	return (0);
 }
 
 /*
  * Allocate *newprops and copy into it every pair of 'props' whose name
  * does not appear in 'skipped'.
  */
 static void
 props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
 {
 	nvpair_t *pair;
 
 	VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	for (pair = nvlist_next_nvpair(props, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(props, pair)) {
 		if (!nvlist_exists(skipped, nvpair_name(pair))) {
 			VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
 		}
 	}
 }
 
 /*
  * Pass every property in 'props' that is not named in 'skipped' to
  * zfs_set_prop_nvlist() with a ZPROP_SRC_NONE-based source.  Returns 0
  * when nothing needs clearing, otherwise zfs_set_prop_nvlist()'s result.
  */
 static int
 clear_received_props(const char *dsname, nvlist_t *props,
     nvlist_t *skipped)
 {
 	nvlist_t *to_clear = NULL;
 	int rc = 0;
 
 	props_skip(props, skipped, &to_clear);
 
 	if (!nvlist_empty(to_clear)) {
 		zprop_source_t flags = ZPROP_SRC_NONE;
 
 		/*
 		 * Acts on local properties until the dataset has received
 		 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
 		 */
 		if (dsl_prop_get_hasrecvd(dsname))
 			flags |= ZPROP_SRC_RECEIVED;
 
 		rc = zfs_set_prop_nvlist(dsname, flags, to_clear, NULL);
 	}
 
 	nvlist_free(to_clear);
 	return (rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_value		name of property to set
  * zc_nvlist_src{_size}	nvlist of properties to apply
  * zc_cookie		received properties flag
  *
  * outputs:
  * zc_nvlist_dst{_size} error for each unapplied received property
  */
 static int
 zfs_ioc_set_prop(zfs_cmd_t *zc)
 {
 	boolean_t received = zc->zc_cookie;
 	zprop_source_t source;
 	nvlist_t *errors;
 	nvlist_t *nvl;
 	int rc;
 
 	if (received)
 		source = ZPROP_SRC_RECEIVED;
 	else
 		source = ZPROP_SRC_LOCAL;
 
 	rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &nvl);
 	if (rc != 0)
 		return (rc);
 
 	if (received) {
 		nvlist_t *prior;
 
 		/*
 		 * Best effort: clear previously received properties that are
 		 * absent from the incoming list, then mark the dataset as
 		 * having received properties.
 		 */
 		if (dsl_prop_get_received(zc->zc_name, &prior) == 0) {
 			(void) clear_received_props(zc->zc_name, prior, nvl);
 			nvlist_free(prior);
 		}
 
 		rc = dsl_prop_set_hasrecvd(zc->zc_name);
 	}
 
 	errors = fnvlist_alloc();
 	if (rc == 0)
 		rc = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
 
 	/* Copy-out failures do not change the result. */
 	if (zc->zc_nvlist_dst != 0 && errors != NULL)
 		(void) put_nvlist(zc, errors);
 
 	nvlist_free(errors);
 	nvlist_free(nvl);
 	return (rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_value		name of property to inherit
  * zc_cookie		revert to received value if TRUE
  *
  * outputs:		none
  */
 static int
 zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 {
 	const char *propname = zc->zc_value;
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	boolean_t received = zc->zc_cookie;
 	zprop_source_t source;
 
 	if (!received) {
 		/* explicitly inherit */
 		source = ZPROP_SRC_INHERITED;
 
 		/*
 		 * Only check this in the non-received case. We want to allow
 		 * 'inherit -S' to revert non-inheritable properties like quota
 		 * and reservation to the received or default values even though
 		 * they are not considered inheritable.
 		 */
 		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
 			return (SET_ERROR(EINVAL));
 	} else {
 		nvlist_t *dummy;
 		zprop_type_t type;
 		int err;
 
 		/* revert to received value, if any */
 		source = ZPROP_SRC_NONE;
 
 		/*
 		 * zfs_prop_set_special() expects properties in the form of an
 		 * nvpair with type info, so work out the type first.
 		 */
 		if (prop == ZPROP_INVAL) {
 			if (!zfs_prop_user(propname))
 				return (SET_ERROR(EINVAL));
 			type = PROP_TYPE_STRING;
 		} else if (prop == ZFS_PROP_VOLSIZE ||
 		    prop == ZFS_PROP_VERSION) {
 			return (SET_ERROR(EINVAL));
 		} else {
 			type = zfs_prop_get_type(prop);
 		}
 
 		VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		/* Build a placeholder pair of the right type. */
 		if (type == PROP_TYPE_STRING) {
 			VERIFY(0 == nvlist_add_string(dummy, propname, ""));
 		} else if (type == PROP_TYPE_NUMBER ||
 		    type == PROP_TYPE_INDEX) {
 			VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
 		} else {
 			nvlist_free(dummy);
 			return (SET_ERROR(EINVAL));
 		}
 
 		err = zfs_prop_set_special(zc->zc_name, source,
 		    nvlist_next_nvpair(dummy, NULL));
 		nvlist_free(dummy);
 		if (err != -1)
 			return (err); /* special property already handled */
 	}
 
 	/* property name has been validated by zfs_secpolicy_inherit_prop() */
 	return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
 }
 
 /*
  * Apply the pool properties in the zc_nvlist_src nvlist to pool zc_name.
  *
  * When the list holds nothing but the cachefile property and the pool is
  * found by spa_lookup(), the cachefile is updated without opening the
  * pool.  Every other case goes through spa_open() and spa_prop_set().
  */
 static int
 zfs_ioc_pool_set_props(zfs_cmd_t *zc)
 {
 	boolean_t cachefile_only;
 	nvpair_t *first;
 	nvlist_t *props;
 	spa_t *spa;
 	int rc;
 
 	rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &props);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * If the only property is the configfile, then just do a spa_lookup()
 	 * to handle the faulted case.
 	 */
 	first = nvlist_next_nvpair(props, NULL);
 	cachefile_only = (first != NULL &&
 	    strcmp(nvpair_name(first),
 	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
 	    nvlist_next_nvpair(props, first) == NULL);
 
 	if (cachefile_only) {
 		mutex_enter(&spa_namespace_lock);
 		spa = spa_lookup(zc->zc_name);
 		if (spa != NULL) {
 			spa_configfile_set(spa, props, B_FALSE);
 			spa_config_sync(spa, B_FALSE, B_TRUE);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		if (spa != NULL) {
 			nvlist_free(props);
 			return (0);
 		}
 	}
 
 	rc = spa_open(zc->zc_name, &spa, FTAG);
 	if (rc == 0) {
 		rc = spa_prop_set(spa, props);
 		nvlist_free(props);
 		spa_close(spa, FTAG);
 	} else {
 		nvlist_free(props);
 	}
 
 	return (rc);
 }
 
 /*
  * Fetch the properties of pool zc_name with spa_prop_get() and hand them
  * to put_nvlist().
  *
  * NOTE(review): any failure to obtain the properties, as well as a zero
  * zc_nvlist_dst, is reported as EFAULT; the underlying error code is not
  * passed through.
  */
 static int
 zfs_ioc_pool_get_props(zfs_cmd_t *zc)
 {
 	nvlist_t *nvp = NULL;
 	spa_t *spa;
 	int rc;
 
 	rc = spa_open(zc->zc_name, &spa, FTAG);
 	if (rc == 0) {
 		rc = spa_prop_get(spa, &nvp);
 		spa_close(spa, FTAG);
 	} else {
 		/*
 		 * If the pool is faulted, there may be properties we can still
 		 * get (such as altroot and cachefile), so attempt to get them
 		 * anyway.
 		 */
 		mutex_enter(&spa_namespace_lock);
 		spa = spa_lookup(zc->zc_name);
 		if (spa != NULL)
 			rc = spa_prop_get(spa, &nvp);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (rc != 0 || zc->zc_nvlist_dst == 0)
 		rc = SET_ERROR(EFAULT);
 	else
 		rc = put_nvlist(zc, nvp);
 
 	nvlist_free(nvp);
 	return (rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  * zc_nvlist_src{_size}	nvlist of delegated permissions
  * zc_perm_action	allow/unallow flag
  *
  * outputs:		none
  */
 static int
 zfs_ioc_set_fsacl(zfs_cmd_t *zc)
 {
 	nvlist_t *acl = NULL;
 	int rc;
 
 	rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &acl);
 	if (rc != 0)
 		return (rc);
 
 	/* A malformed permission list is always reported as EINVAL. */
 	if (zfs_deleg_verify_nvlist(acl) != 0) {
 		nvlist_free(acl);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If we don't have PRIV_SYS_MOUNT, then validate
 	 * that user is allowed to hand out each permission in
 	 * the nvlist(s)
 	 */
 	rc = secpolicy_zfs(CRED());
 	if (rc != 0) {
 		if (zc->zc_perm_action == B_FALSE)
 			rc = dsl_deleg_can_allow(zc->zc_name, acl, CRED());
 		else
 			rc = dsl_deleg_can_unallow(zc->zc_name, acl, CRED());
 	}
 
 	if (rc == 0)
 		rc = dsl_deleg_set(zc->zc_name, acl, zc->zc_perm_action);
 
 	nvlist_free(acl);
 	return (rc);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
  *
  * outputs:
  * nvlist of delegated permissions, returned through put_nvlist()
  * NOTE(review): presumably lands in zc_nvlist_dst{_size} like the other
  * put_nvlist() callers -- confirm against put_nvlist().
  */
 static int
 zfs_ioc_get_fsacl(zfs_cmd_t *zc)
 {
 	nvlist_t *perms;
 	int rc;
 
 	rc = dsl_deleg_get(zc->zc_name, &perms);
 	if (rc != 0)
 		return (rc);
 
 	rc = put_nvlist(zc, perms);
 	nvlist_free(perms);
 	return (rc);
 }
 
 /*
  * Search the vfs list for a specified resource.  Returns a pointer to it
  * or NULL if no suitable entry is found. The caller of this routine
  * is responsible for releasing the returned vfs pointer.
  */
 static vfs_t *
 zfs_get_vfs(const char *resource)
 {
 	vfs_t *found = NULL;
 	vfs_t *vfsp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
 		if (strcmp(refstr_value(vfsp->vfs_resource), resource) != 0)
 			continue;
 
 		/* Take the hold while the mount list is still locked. */
 		VFS_HOLD(vfsp);
 		found = vfsp;
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	return (found);
 }
 
 /*
  * Objset creation callback for filesystems: 'arg' is a zfs_creat_t whose
  * zct_zplprops are forwarded to zfs_create_fs().
  */
 /* ARGSUSED */
 static void
 zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	zfs_create_fs(os, cr, ((zfs_creat_t *)arg)->zct_zplprops, tx);
 }
 
 #define	ZFS_PROP_UNDEFINED	((uint64_t)-1)
 
 /*
  * inputs:
  * os			parent objset pointer (NULL if root fs)
  * zplver		default zpl version (overridden by createprops)
  * fuids_ok		fuids allowed in this version of the spa?
  * sa_ok		SAs allowed in this version of the spa?
  * createprops		list of properties requested by creator
  *
  * outputs:
  * zplprops	values for the zplprops we attach to the master node object
  * is_ci	true if requested file system will be purely case-insensitive
  *
  * Determine the settings for utf8only, normalization and
  * casesensitivity.  Specific values may have been requested by the
  * creator and/or we can inherit values from the parent dataset.  If
  * the file system is of too early a vintage, a creator can not
  * request settings for these properties, even if the requested
  * setting is the default value.  We don't actually want to create dsl
  * properties for these, so remove them from the source nvlist after
  * processing.
  */
 static int
 zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
     boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	uint64_t sense = ZFS_PROP_UNDEFINED;
 	uint64_t norm = ZFS_PROP_UNDEFINED;
 	uint64_t u8 = ZFS_PROP_UNDEFINED;
 
 	ASSERT(zplprops != NULL);
 
 	/*
 	 * Pull out creator prop choices, if any.  The version stays in
 	 * createprops; the other three are removed once read.
 	 */
 	if (createprops != NULL) {
 		const char *norm_name = zfs_prop_to_name(ZFS_PROP_NORMALIZE);
 		const char *u8_name = zfs_prop_to_name(ZFS_PROP_UTF8ONLY);
 		const char *case_name = zfs_prop_to_name(ZFS_PROP_CASE);
 
 		(void) nvlist_lookup_uint64(createprops,
 		    zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
 
 		(void) nvlist_lookup_uint64(createprops, norm_name, &norm);
 		(void) nvlist_remove_all(createprops, norm_name);
 
 		(void) nvlist_lookup_uint64(createprops, u8_name, &u8);
 		(void) nvlist_remove_all(createprops, u8_name);
 
 		(void) nvlist_lookup_uint64(createprops, case_name, &sense);
 		(void) nvlist_remove_all(createprops, case_name);
 	}
 
 	/*
 	 * If the zpl version requested is whacky or the file system
 	 * or pool version is too "young" to support normalization
 	 * and the creator tried to set a value for one of the props,
 	 * error out.
 	 */
 	if (zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION)
 		return (SET_ERROR(ENOTSUP));
 	if (zplver >= ZPL_VERSION_FUID && !fuids_ok)
 		return (SET_ERROR(ENOTSUP));
 	if (zplver >= ZPL_VERSION_SA && !sa_ok)
 		return (SET_ERROR(ENOTSUP));
 	if (zplver < ZPL_VERSION_NORMALIZATION &&
 	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
 	    sense != ZFS_PROP_UNDEFINED))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Put the version in the zplprops
 	 */
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
 
 	/* Normalization: creator's choice, else ask zfs_get_zplprop(). */
 	if (norm == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
 
 	/*
 	 * If we're normalizing, names must always be valid UTF-8 strings.
 	 */
 	if (norm != 0)
 		u8 = 1;
 	if (u8 == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
 
 	/* Case sensitivity: creator's choice, else ask zfs_get_zplprop(). */
 	if (sense == ZFS_PROP_UNDEFINED) {
 		VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
 	}
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
 
 	if (is_ci != NULL)
 		*is_ci = (sense == ZFS_CASE_INSENSITIVE);
 
 	return (0);
 }
 
 /*
  * Compute the zplprops for a new, non-root filesystem named 'dataset'.
  *
  * The default ZPL version is derived from the pool's spa version, and
  * the parent dataset (the name up to the last '/') is held so that
  * zfs_fill_zplprops_impl() can consult it.
  */
 static int
 zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	char parentname[MAXNAMELEN];
 	boolean_t fuids_ok, sa_ok;
 	uint64_t pool_version;
 	uint64_t zplver;
 	objset_t *os = NULL;
 	spa_t *spa;
 	char *cp;
 	int error;
 
 	/* Derive the parent's name by cutting at the last '/'. */
 	(void) strlcpy(parentname, dataset, sizeof (parentname));
 	cp = strrchr(parentname, '/');
 	ASSERT(cp != NULL);
 	*cp = '\0';
 
 	error = spa_open(dataset, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	pool_version = spa_version(spa);
 	spa_close(spa, FTAG);
 
 	zplver = zfs_zpl_version_map(pool_version);
 	fuids_ok = (zplver >= ZPL_VERSION_FUID);
 	sa_ok = (zplver >= ZPL_VERSION_SA);
 
 	/*
 	 * Open parent object set so we can inherit zplprop values.
 	 */
 	error = dmu_objset_hold(parentname, FTAG, &os);
 	if (error == 0) {
 		error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok,
 		    createprops, zplprops, is_ci);
 		dmu_objset_rele(os, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Compute the zplprops for a root filesystem: the default ZPL version is
  * derived from spa_vers and no parent objset is supplied (os is NULL).
  */
 static int
 zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
 	uint64_t zplver = zfs_zpl_version_map(spa_vers);
 	boolean_t fuids_ok = (zplver >= ZPL_VERSION_FUID);
 	boolean_t sa_ok = (zplver >= ZPL_VERSION_SA);
 
 	return (zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
 	    createprops, zplprops, is_ci));
 }
 
 /*
  * innvl: {
  *     "type" -> dmu_objset_type_t (int32)
  *     (optional) "props" -> { prop -> value }
  * }
  *
  * outnvl: propname -> error code (int32)
  *
  * Only DMU_OST_ZFS and DMU_OST_ZVOL may be created, and the name may not
  * contain '@' or '%'.  If applying the properties fails after the objset
  * was created, the new dataset is destroyed again.
  */
 static int
 zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 	boolean_t is_insensitive = B_FALSE;
 	zfs_creat_t zct = { 0 };
 	nvlist_t *nvprops = NULL;
 	dmu_objset_type_t type;
 	int32_t type32;
 	int error = 0;
 
 	if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
 		return (SET_ERROR(EINVAL));
 	type = type32;
 	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
 
 	/* Pick the creation callback; unknown types get none. */
 	if (type == DMU_OST_ZFS)
 		cbfunc = zfs_create_cb;
 	else if (type == DMU_OST_ZVOL)
 		cbfunc = zvol_create_cb;
 	else
 		cbfunc = NULL;
 
 	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	zct.zct_props = nvprops;
 
 	if (cbfunc == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (type == DMU_OST_ZVOL) {
 		uint64_t volsize, volblocksize;
 
 		/* A volume needs at least a volsize property. */
 		if (nvprops == NULL)
 			return (SET_ERROR(EINVAL));
 		if (nvlist_lookup_uint64(nvprops,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* volblocksize is optional; fall back to the default. */
 		error = nvlist_lookup_uint64(nvprops,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize);
 		if (error == ENOENT) {
 			volblocksize = zfs_prop_default_numeric(
 			    ZFS_PROP_VOLBLOCKSIZE);
 		} else if (error != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = zvol_check_volblocksize(volblocksize);
 		if (error != 0)
 			return (error);
 		error = zvol_check_volsize(volsize, volblocksize);
 		if (error != 0)
 			return (error);
 	} else if (type == DMU_OST_ZFS) {
 		/*
 		 * We have to have normalization and
 		 * case-folding flags correct when we do the
 		 * file system creation, so go figure them out
 		 * now.
 		 */
 		VERIFY(nvlist_alloc(&zct.zct_zplprops,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		error = zfs_fill_zplprops(fsname, nvprops,
 		    zct.zct_zplprops, &is_insensitive);
 		if (error != 0) {
 			nvlist_free(zct.zct_zplprops);
 			return (error);
 		}
 	}
 
 	error = dmu_objset_create(fsname, type,
 	    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
 	nvlist_free(zct.zct_zplprops);
 
 	/*
 	 * It would be nice to do this atomically.
 	 */
 	if (error == 0) {
 		error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
 		    nvprops, outnvl);
 		if (error != 0)
 			(void) dsl_destroy_head(fsname);
 	}
 #ifdef __FreeBSD__
 	if (error == 0 && type == DMU_OST_ZVOL)
 		zvol_create_minors(fsname);
 #endif
 	return (error);
 }
 
 /*
  * Clone the snapshot named by "origin" into a new dataset fsname.
  *
  * innvl: {
  *     "origin" -> name of origin snapshot
  *     (optional) "props" -> { prop -> value }
  * }
  *
  * outnvl: propname -> error code (int32)
  */
 static int
 zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *props = NULL;
 	char *origin;
 	int err;
 
 	if (nvlist_lookup_string(innvl, "origin", &origin) != 0)
 		return (SET_ERROR(EINVAL));
 	(void) nvlist_lookup_nvlist(innvl, "props", &props);
 
 	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (dataset_namecheck(origin, NULL, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	err = dmu_objset_clone(fsname, origin);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * It would be nice to do this atomically.  If the properties cannot
 	 * be applied, the freshly created clone is destroyed again.
 	 */
 	err = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, props, outnvl);
 	if (err != 0)
 		(void) dsl_destroy_head(fsname);
 #ifdef __FreeBSD__
 	if (err == 0)
 		zvol_create_minors(fsname);
 #endif
 	return (err);
 }
 
 /*
  * Validate a set of snapshot names and create them all via
  * dsl_dataset_snapshot().
  *
  * innvl: {
  *     "snaps" -> { snapshot1, snapshot2 }
  *     (optional) "props" -> { prop -> value (string) }
  * }
  *
  * outnvl: snapshot -> error code (int32)
  */
 static int
 zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *snaplist;
 	nvlist_t *snapprops = NULL;
 	nvpair_t *cur;
 	int err, pool_namelen;
 
 	(void) nvlist_lookup_nvlist(innvl, "props", &snapprops);
 	err = zfs_check_userprops(poolname, snapprops);
 	if (err != 0)
 		return (err);
 
 	if (!nvlist_empty(snapprops) &&
 	    zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
 		return (SET_ERROR(ENOTSUP));
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaplist) != 0)
 		return (SET_ERROR(EINVAL));
 
 	pool_namelen = strlen(poolname);
 	cur = nvlist_next_nvpair(snaplist, NULL);
 	while (cur != NULL) {
 		const char *snapname = nvpair_name(cur);
 		const char *at = strchr(snapname, '@');
 		nvpair_t *other;
 
 		/*
 		 * A snapshot name needs an '@' followed by a valid
 		 * component name.
 		 */
 		if (at == NULL ||
 		    zfs_component_namecheck(at + 1, NULL, NULL) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* The snapshot has to live in the pool we were given. */
 		if (strncmp(snapname, poolname, pool_namelen) != 0 ||
 		    (snapname[pool_namelen] != '/' &&
 		    snapname[pool_namelen] != '@'))
 			return (SET_ERROR(EXDEV));
 
 		/*
 		 * No later entry may share this entry's "<fs>@" prefix:
 		 * only one snapshot per filesystem is allowed per request.
 		 */
 		for (other = nvlist_next_nvpair(snaplist, cur); other != NULL;
 		    other = nvlist_next_nvpair(snaplist, other)) {
 			if (strncmp(snapname, nvpair_name(other),
 			    at - snapname + 1) == 0)
 				return (SET_ERROR(EXDEV));
 		}
 
 		cur = nvlist_next_nvpair(snaplist, cur);
 	}
 
 	return (dsl_dataset_snapshot(snaplist, snapprops, outnvl));
 }
 
 /*
  * Append a message to the history of the pool whose name was stashed in
  * thread-specific data under zfs_allow_log_key.
  *
  * innvl: "message" -> string
  *
  * Returns EINVAL if no pool name is stashed or "message" is missing,
  * ENOTSUP if the pool is older than SPA_VERSION_ZPOOL_HISTORY, or the error
  * from spa_open() / spa_history_log().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	char *message;
 	spa_t *spa;
 	int error;
 	char *poolname;
 
 	/*
 	 * The poolname in the ioctl is not set, we get it from the TSD,
 	 * which was set at the end of the last successful ioctl that allows
 	 * logging.  The secpolicy func is expected to have checked that it
 	 * is set, but passing NULL to spa_open()/strfree() would be fatal,
 	 * so fail cleanly if it is missing.
 	 * Only one log ioctl is allowed after each successful ioctl, so
 	 * we clear the TSD here.
 	 */
 	poolname = tsd_get(zfs_allow_log_key);
 	if (poolname == NULL)
 		return (SET_ERROR(EINVAL));
 	(void) tsd_set(zfs_allow_log_key, NULL);
 	error = spa_open(poolname, &spa, FTAG);
 	/* The stashed name is consumed here regardless of the outcome. */
 	strfree(poolname);
 	if (error != 0)
 		return (error);
 
 	if (nvlist_lookup_string(innvl, "message", &message) != 0)  {
 		spa_close(spa, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = spa_history_log(spa, message);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Force-unmount the snapshot named snapname if it is currently mounted.
  *
  * The dp_config_rwlock must not be held when calling this, because the
  * unmount may need to write out data.
  *
  * This function is best-effort.  Callers must deal gracefully if it
  * remains mounted (or is remounted after this call).
  *
  * Returns 0 if the argument is not a snapshot, or it is not currently a
  * filesystem, or we were able to unmount it.  Returns error code otherwise.
  */
 int
 zfs_unmount_snap(const char *snapname)
 {
 	vfs_t *vfsp;
 	zfsvfs_t *zfsvfs;
 	int err;
 
 	/* Names without an '@' are not snapshots; nothing to do. */
 	if (strchr(snapname, '@') == NULL)
 		return (0);
 
 	/* No vfs for this name is treated as "not mounted". */
 	vfsp = zfs_get_vfs(snapname);
 	if (vfsp == NULL)
 		return (0);
 
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
 
 	/*
 	 * The vfs is released right after the lock attempt, whether or not
 	 * the lock was obtained; a lock failure is the only error this
 	 * function reports.
 	 */
 	err = vn_vfswlock(vfsp->vfs_vnodecovered);
 	VFS_RELE(vfsp);
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	/*
 	 * Always force the unmount for snapshots.
 	 */
 
 	/* The result of dounmount() is deliberately ignored (best-effort). */
 #ifdef illumos
 	(void) dounmount(vfsp, MS_FORCE, kcred);
 #else
 	vfs_ref(vfsp);
 	(void) dounmount(vfsp, MS_FORCE, curthread);
 #endif
 	return (0);
 }
 
 /*
  * Callback-shaped wrapper around zfs_unmount_snap(); the extra argument
  * is ignored.
  */
 /* ARGSUSED */
 static int
 zfs_unmount_snap_cb(const char *snapname, void *arg)
 {
 	int unmount_err = zfs_unmount_snap(snapname);
 
 	return (unmount_err);
 }
 
 /*
  * When a clone is destroyed, its origin may also need to be destroyed,
  * in which case it must be unmounted.  This routine will do that unmount
  * if necessary.
  */
 void
 zfs_destroy_unmount_origin(const char *fsname)
 {
 	int error;
 	objset_t *os;
 	dsl_dataset_t *ds;
 
 	error = dmu_objset_hold(fsname, FTAG, &os);
 	if (error != 0)
 		return;
 	ds = dmu_objset_ds(os);
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
 		char originname[MAXNAMELEN];
 		dsl_dataset_name(ds->ds_prev, originname);
 		dmu_objset_rele(os, FTAG);
 		(void) zfs_unmount_snap(originname);
 	} else {
 		dmu_objset_rele(os, FTAG);
 	}
 }
 
 /*
  * Unmount and then destroy a set of snapshots.
  *
  * innvl: {
  *     "snaps" -> { snapshot1, snapshot2 }
  *     (optional boolean) "defer"
  * }
  *
  * outnvl: snapshot -> error code (int32)
  *
  */
 /* ARGSUSED */
 static int
 zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvlist_t *snaplist;
 	nvpair_t *cur;
 	boolean_t defer;
 	int err, pool_namelen;
 
 	if (nvlist_lookup_nvlist(innvl, "snaps", &snaplist) != 0)
 		return (SET_ERROR(EINVAL));
 	defer = nvlist_exists(innvl, "defer");
 
 	pool_namelen = strlen(poolname);
 	cur = nvlist_next_nvpair(snaplist, NULL);
 	while (cur != NULL) {
 		const char *snapname = nvpair_name(cur);
 		boolean_t in_pool;
 
 		/*
 		 * Every snapshot has to belong to the named pool; otherwise
 		 * the zvol minor removal below could hit the wrong device.
 		 */
 		in_pool = (strncmp(snapname, poolname, pool_namelen) == 0 &&
 		    (snapname[pool_namelen] == '/' ||
 		    snapname[pool_namelen] == '@'));
 		if (!in_pool)
 			return (SET_ERROR(EXDEV));
 
 		err = zfs_unmount_snap(snapname);
 		if (err != 0)
 			return (err);
 #if defined(__FreeBSD__)
 		zvol_remove_minors(snapname);
 #endif
 		cur = nvlist_next_nvpair(snaplist, cur);
 	}
 
 	return (dsl_destroy_snapshots_nvl(snaplist, defer, outnvl));
 }
 
 /*
  * Create bookmarks.  Bookmark names are of the form <fs>#<bmark>.
  * All bookmarks must be in the same pool.
  *
  * innvl: {
  *     bookmark1 -> snapshot1, bookmark2 -> snapshot2
  * }
  *
  * outnvl: bookmark -> error code (int32)
  *
  * Returns EINVAL if any value is not a string or any bookmark name appears
  * more than once; otherwise the result of dsl_bookmark_create().
  */
 /* ARGSUSED */
 static int
 zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	nvpair_t *bmark, *later;
 
 	bmark = nvlist_next_nvpair(innvl, NULL);
 	while (bmark != NULL) {
 		char *snap_name;
 
 		/* Each bookmark must map to a string (the snapshot name). */
 		if (nvpair_value_string(bmark, &snap_name) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* Reject any later pair that reuses this bookmark name. */
 		for (later = nvlist_next_nvpair(innvl, bmark); later != NULL;
 		    later = nvlist_next_nvpair(innvl, later)) {
 			if (strcmp(nvpair_name(bmark),
 			    nvpair_name(later)) == 0)
 				return (SET_ERROR(EINVAL));
 		}
 
 		bmark = nvlist_next_nvpair(innvl, bmark);
 	}
 
 	return (dsl_bookmark_create(innvl, outnvl));
 }
 
 /*
  * Thin ioctl wrapper: hands the request straight to dsl_get_bookmarks().
  *
  * innvl: {
  *     property 1, property 2, ...
  * }
  *
  * outnvl: {
  *     bookmark name 1 -> { property 1, property 2, ... },
  *     bookmark name 2 -> { property 1, property 2, ... }
  * }
  *
  */
 static int
 zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	int err = dsl_get_bookmarks(fsname, innvl, outnvl);
 
 	return (err);
 }
 
 /*
  * Validate a set of bookmark names and destroy them via
  * dsl_bookmark_destroy().
  *
  * innvl: {
  *     bookmark name 1, bookmark name 2
  * }
  *
  * outnvl: bookmark -> error code (int32)
  *
  */
 static int
 zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
     nvlist_t *outnvl)
 {
 	nvpair_t *cur;
 	int pool_namelen = strlen(poolname);
 
 	cur = nvlist_next_nvpair(innvl, NULL);
 	while (cur != NULL) {
 		const char *bmname = nvpair_name(cur);
 		const char *hash = strchr(bmname, '#');
 
 		/*
 		 * A bookmark name needs a '#' followed by a valid
 		 * component name.
 		 */
 		if (hash == NULL ||
 		    zfs_component_namecheck(hash + 1, NULL, NULL) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* The bookmark has to live in the pool we were given. */
 		if (strncmp(bmname, poolname, pool_namelen) != 0 ||
 		    (bmname[pool_namelen] != '/' &&
 		    bmname[pool_namelen] != '#'))
 			return (SET_ERROR(EXDEV));
 
 		cur = nvlist_next_nvpair(innvl, cur);
 	}
 
 	return (dsl_bookmark_destroy(innvl, outnvl));
 }
 
 /*
  * Destroy a dataset: a snapshot if the name contains '@', a head dataset
  * otherwise.
  *
  * inputs:
  * zc_name		name of dataset to destroy
  * zc_objset_type	type of objset
  * zc_defer_destroy	mark for deferred destroy
  *
  * outputs:		none
  */
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
 	boolean_t is_snapshot;
 	int err;
 
 	/* A mounted snapshot is unmounted before it is destroyed. */
 	if (zc->zc_objset_type == DMU_OST_ZFS) {
 		err = zfs_unmount_snap(zc->zc_name);
 		if (err != 0)
 			return (err);
 	}
 
 	is_snapshot = (strchr(zc->zc_name, '@') != NULL);
 	if (is_snapshot)
 		err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
 	else
 		err = dsl_destroy_head(zc->zc_name);
 
 	/* Drop the device minor(s) only once the volume is really gone. */
 	if (err == 0 && zc->zc_objset_type == DMU_OST_ZVOL) {
 #ifdef __FreeBSD__
 		zvol_remove_minors(zc->zc_name);
 #else
 		(void) zvol_remove_minor(zc->zc_name);
 #endif
 	}
 	return (err);
 }
 
 /*
  * fsname is name of dataset to rollback (to most recent snapshot)
  *
  * innvl is not used.
  *
  * outnvl: "target" -> name of most recent snapshot
  *
  * If a zfsvfs exists for fsname, the filesystem is suspended around the
  * rollback and resumed afterwards; a rollback error takes precedence over
  * a resume error.
  */
 /* ARGSUSED */
 static int
 zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
 {
 	zfsvfs_t *zfsvfs;
 	int err;
 
 	/* No zfsvfs: roll back without any suspend/resume. */
 	if (getzfsvfs(fsname, &zfsvfs) != 0)
 		return (dsl_dataset_rollback(fsname, NULL, outnvl));
 
 	err = zfs_suspend_fs(zfsvfs);
 	if (err == 0) {
 		int resume_err;
 
 		err = dsl_dataset_rollback(fsname, zfsvfs, outnvl);
 		/* Always resume, even when the rollback itself failed. */
 		resume_err = zfs_resume_fs(zfsvfs, fsname);
 		if (err == 0)
 			err = resume_err;
 	}
 	VFS_RELE(zfsvfs->z_vfs);
 	return (err);
 }
 
 /*
  * dmu_objset_find() callback used by zfs_ioc_rename(): builds
  * "<fsname>@<snapname>" (arg is the snapshot component) and unmounts that
  * snapshot.
  */
 static int
 recursive_unmount(const char *fsname, void *arg)
 {
 	char snap_fullname[MAXNAMELEN];
 	const char *snap_component = arg;
 
 	(void) snprintf(snap_fullname, sizeof (snap_fullname), "%s@%s",
 	    fsname, snap_component);
 
 	return (zfs_unmount_snap(snap_fullname));
 }
 
 /*
  * Rename a dataset or a snapshot.
  *
  * inputs:
  * zc_name	old name of dataset
  * zc_value	new name of dataset
  * zc_cookie	recursive flag (only valid for snapshots)
  *
  * outputs:	none
  */
 static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
 	boolean_t recursive = zc->zc_cookie & 1;
 	boolean_t allow_mounted = B_TRUE;
 	char *at;
 	int err = 0;
 
 #ifdef __FreeBSD__
 	allow_mounted = (zc->zc_cookie & 2) != 0;
 #endif
 
 	/* Make sure the new name is terminated before inspecting it. */
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '%') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	at = strchr(zc->zc_name, '@');
 	if (at == NULL) {
 		/* Not a snapshot: plain dataset rename. */
 #ifdef illumos
 		if (zc->zc_objset_type == DMU_OST_ZVOL)
 			(void) zvol_remove_minor(zc->zc_name);
 #endif
 		return (dsl_dir_rename(zc->zc_name, zc->zc_value));
 	}
 
 	/*
 	 * Snapshots must stay in the same fs: old and new names have to
 	 * agree up to and including the '@'.
 	 */
 	if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1) != 0)
 		return (SET_ERROR(EXDEV));
 
 	/* Temporarily split zc_name into "<fs>" and "<snap>". */
 	*at = '\0';
 	if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) {
 		err = dmu_objset_find(zc->zc_name,
 		    recursive_unmount, at + 1,
 		    recursive ? DS_FIND_CHILDREN : 0);
 	}
 	if (err == 0) {
 		err = dsl_dataset_rename_snapshot(zc->zc_name,
 		    at + 1, strchr(zc->zc_value, '@') + 1, recursive);
 	}
 	/* Put the '@' back so the caller sees zc_name unchanged. */
 	*at = '@';
 
 	return (err);
 }
 
 /*
  * Check whether the property carried by 'pair' may be set on dataset
  * 'dsname'.
  *
  * Non-native properties (zfs_name_to_prop() yields ZPROP_INVAL) are allowed
  * only if they are user properties, or user/group quota properties on a
  * non-snapshot, and the matching zfs_secpolicy_write_perms() check passes.
  * Native properties are rejected on snapshots, then checked against the pool
  * version / enabled features, and finally against zfs_secpolicy_setprop().
  *
  * Returns 0 if the property may be set.  Otherwise returns EINVAL (unknown
  * or read-only property, native property on a snapshot, non-uint64
  * checksum/dedup value), ENOTSUP (pool version or feature does not support
  * the value), ERANGE (value not permitted, e.g. on a boot filesystem), or an
  * error passed through from spa_open() or the secpolicy routines.
  *
  * NOTE(review): the final zfs_secpolicy_setprop() call uses CRED() rather
  * than the 'cr' argument; 'cr' is only used for the user/userquota checks.
  */
 static int
 zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
 {
 	const char *propname = nvpair_name(pair);
 	boolean_t issnap = (strchr(dsname, '@') != NULL);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
 	int err;
 
 	if (prop == ZPROP_INVAL) {
 		if (zfs_prop_user(propname)) {
 			if (err = zfs_secpolicy_write_perms(dsname,
 			    ZFS_DELEG_PERM_USERPROP, cr))
 				return (err);
 			return (0);
 		}
 
 		if (!issnap && zfs_prop_userquota(propname)) {
 			const char *perm = NULL;
 			const char *uq_prefix =
 			    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
 			const char *gq_prefix =
 			    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
 
 			/* Pick the delegated permission from the prefix. */
 			if (strncmp(propname, uq_prefix,
 			    strlen(uq_prefix)) == 0) {
 				perm = ZFS_DELEG_PERM_USERQUOTA;
 			} else if (strncmp(propname, gq_prefix,
 			    strlen(gq_prefix)) == 0) {
 				perm = ZFS_DELEG_PERM_GROUPQUOTA;
 			} else {
 				/* USERUSED and GROUPUSED are read-only */
 				return (SET_ERROR(EINVAL));
 			}
 
 			if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
 				return (err);
 			return (0);
 		}
 
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Native properties are never settable on a snapshot here. */
 	if (issnap)
 		return (SET_ERROR(EINVAL));
 
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		/*
 		 * dsl_prop_get_all_impl() returns properties in this
 		 * format.
 		 */
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
 		/* From here on 'pair' refers to the inner value pair. */
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
 		    &pair) == 0);
 	}
 
 	/*
 	 * Check that this value is valid for this pool version
 	 */
 	switch (prop) {
 	case ZFS_PROP_COMPRESSION:
 		/*
 		 * If the user specified gzip compression, make sure
 		 * the SPA supports it. We ignore any errors here since
 		 * we'll catch them later.
 		 */
 		if (nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9 &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_GZIP_COMPRESSION)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 
 			if (intval == ZIO_COMPRESS_ZLE &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_ZLE_COMPRESSION))
 				return (SET_ERROR(ENOTSUP));
 
 			/* LZ4 is gated on a pool feature, not a version. */
 			if (intval == ZIO_COMPRESS_LZ4) {
 				spa_t *spa;
 
 				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 					return (err);
 
 				if (!spa_feature_is_enabled(spa,
 				    SPA_FEATURE_LZ4_COMPRESS)) {
 					spa_close(spa, FTAG);
 					return (SET_ERROR(ENOTSUP));
 				}
 				spa_close(spa, FTAG);
 			}
 
 			/*
 			 * If this is a bootable dataset then
 			 * verify that the compression algorithm
 			 * is supported for booting. We must return
 			 * something other than ENOTSUP since it
 			 * implies a downrev pool version.
 			 */
 			if (zfs_is_bootfs(dsname) &&
 			    !BOOTFS_COMPRESS_VALID(intval)) {
 				return (SET_ERROR(ERANGE));
 			}
 		}
 		break;
 
 	case ZFS_PROP_COPIES:
 		if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
 			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_RECORDSIZE:
 		/* Record sizes above 128k need the feature to be enabled */
 		if (nvpair_value_uint64(pair, &intval) == 0 &&
 		    intval > SPA_OLD_MAXBLOCKSIZE) {
 			spa_t *spa;
 
 			/*
 			 * If this is a bootable dataset then
 			 * the we don't allow large (>128K) blocks,
 			 * because GRUB doesn't support them.
 			 */
 			/*
 			 * NOTE(review): the size comparison below repeats
 			 * the enclosing condition and is always true here.
 			 */
 			if (zfs_is_bootfs(dsname) &&
 			    intval > SPA_OLD_MAXBLOCKSIZE) {
 				return (SET_ERROR(ERANGE));
 			}
 
 			/*
 			 * We don't allow setting the property above 1MB,
 			 * unless the tunable has been changed.
 			 */
 			if (intval > zfs_max_recordsize ||
 			    intval > SPA_MAXBLOCKSIZE)
 				return (SET_ERROR(ERANGE));
 
 			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 				return (err);
 
 			if (!spa_feature_is_enabled(spa,
 			    SPA_FEATURE_LARGE_BLOCKS)) {
 				spa_close(spa, FTAG);
 				return (SET_ERROR(ENOTSUP));
 			}
 			spa_close(spa, FTAG);
 		}
 		break;
 
 	case ZFS_PROP_SHARESMB:
 		if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
 			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_ACLINHERIT:
 		if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
 		    nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval == ZFS_ACL_PASSTHROUGH_X &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_PASSTHROUGH_X))
 				return (SET_ERROR(ENOTSUP));
 		}
 		break;
 
 	case ZFS_PROP_CHECKSUM:
 	case ZFS_PROP_DEDUP:
 	{
 		spa_feature_t feature;
 		spa_t *spa;
 
 		/* dedup feature version checks */
 		if (prop == ZFS_PROP_DEDUP &&
 		    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
 			return (SET_ERROR(ENOTSUP));
 
 		/* Unlike the cases above, a non-uint64 value is an error. */
 		if (nvpair_value_uint64(pair, &intval) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* check prop value is enabled in features */
 		feature = zio_checksum_to_feature(intval);
 		if (feature == SPA_FEATURE_NONE)
 			break;
 
 		if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 			return (err);
 		/*
 		 * Salted checksums are not supported on root pools.
 		 */
 		if (spa_bootfs(spa) != 0 &&
 		    intval < ZIO_CHECKSUM_FUNCTIONS &&
 		    (zio_checksum_table[intval].ci_flags &
 		    ZCHECKSUM_FLAG_SALTED)) {
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ERANGE));
 		}
 		if (!spa_feature_is_enabled(spa, feature)) {
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 		spa_close(spa, FTAG);
 		break;
 	}
 	}
 
 	/* Version/feature checks passed; apply the permission policy. */
 	return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
 }
 
 /*
  * Sync-task check callback for zfs_prop_activate_feature().  Guards against
  * a race that would increment a feature flag more than once: returns EBUSY
  * if the feature pointed to by arg is already active, 0 otherwise.
  */
 static int
 zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
 {
 	spa_feature_t *feature = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	if (spa_feature_is_active(spa, *feature))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Sync-task callback for zfs_prop_activate_feature(): bumps the reference
  * count of the feature pointed to by arg within the given transaction.
  */
 static void
 zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_feature_t *feature = arg;
 
 	spa_feature_incr(dmu_tx_pool(tx)->dp_spa, *feature, tx);
 }
 
 /*
  * Activates a feature on a pool in response to a property setting, by
  * running a sync task that marks the feature active.
  *
  * Returns 0 on success or if the feature was already active; any other
  * sync-task error is returned unchanged.
  */
 static int
 zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
 {
 	int err = dsl_sync_task(spa_name(spa),
 	    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
 	    &feature, 2, ZFS_SPACE_CHECK_RESERVED);
 
 	/* EBUSY from the check callback means "already active". */
 	if (err == 0 || err == EBUSY)
 		return (0);
 
 	return (err);
 }
 
 /*
  * Removes properties from the given props list that fail permission checks
  * needed to clear them and to restore them in case of a receive error. For each
  * property, make sure we have both set and inherit permissions.
  *
  * Returns the first error encountered if any permission checks fail. If the
  * caller provides a non-NULL errlist, it also gives the complete list of names
  * of all the properties that failed a permission check along with the
  * corresponding error numbers. The caller is responsible for freeing the
  * returned errlist.
  *
  * If every property checks out successfully, zero is returned and the list
  * pointed at by errlist is NULL.
  *
  * A NULL props list is accepted and yields 0 without touching *errlist.
  */
 static int
 zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
 {
 	zfs_cmd_t *zc;
 	nvpair_t *pair, *next_pair;
 	nvlist_t *errors;
 	int err, rv = 0;
 
 	if (props == NULL)
 		return (0);
 
 	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/*
 	 * A scratch zfs_cmd_t carries the dataset and property names to
 	 * zfs_secpolicy_inherit_prop().  Copies into its fixed-size buffers
 	 * are bounded so an over-long name cannot overrun them.
 	 */
 	zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
 	(void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name));
 	pair = nvlist_next_nvpair(props, NULL);
 	while (pair != NULL) {
 		/* Fetch the successor first: 'pair' may be removed below. */
 		next_pair = nvlist_next_nvpair(props, pair);
 
 		(void) strlcpy(zc->zc_value, nvpair_name(pair),
 		    sizeof (zc->zc_value));
 		if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
 		    (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
 			VERIFY(nvlist_remove_nvpair(props, pair) == 0);
 			VERIFY(nvlist_add_int32(errors,
 			    zc->zc_value, err) == 0);
 		}
 		pair = next_pair;
 	}
 	kmem_free(zc, sizeof (zfs_cmd_t));
 
 	if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
 		/* Nothing failed: report "no error list" rather than empty. */
 		nvlist_free(errors);
 		errors = NULL;
 	} else {
 		/* The return value is the first recorded error. */
 		VERIFY(nvpair_value_int32(pair, &rv) == 0);
 	}
 
 	if (errlist == NULL)
 		nvlist_free(errors);
 	else
 		*errlist = errors;
 
 	return (rv);
 }
 
 /*
  * Compare the values of two property nvpairs.  A pair of type NVLIST (the
  * dsl_prop_get_all_impl() format) is first replaced by its ZPROP_VALUE
  * member.  Pairs of differing type never match; string values are compared
  * with strcmp(), everything else is read and compared as uint64.
  */
 static boolean_t
 propval_equals(nvpair_t *p1, nvpair_t *p2)
 {
 	nvlist_t *attrs;
 	uint64_t num1, num2;
 	char *str1, *str2;
 
 	/* Unwrap the dsl_prop_get_all_impl() format on either side. */
 	if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
 		VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0);
 	}
 
 	if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
 		VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
 		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0);
 	}
 
 	if (nvpair_type(p1) != nvpair_type(p2))
 		return (B_FALSE);
 
 	if (nvpair_type(p1) != DATA_TYPE_STRING) {
 		VERIFY(nvpair_value_uint64(p1, &num1) == 0);
 		VERIFY(nvpair_value_uint64(p2, &num2) == 0);
 		return (num1 == num2);
 	}
 
 	VERIFY(nvpair_value_string(p1, &str1) == 0);
 	VERIFY(nvpair_value_string(p2, &str2) == 0);
 	return (strcmp(str1, str2) == 0);
 }
 
 /*
  * Remove properties from props if they are not going to change (as determined
  * by comparison with origprops). Remove them from origprops as well, since we
  * do not need to clear or restore properties that won't change.
  *
  * A NULL origprops leaves props untouched.
  */
 static void
 props_reduce(nvlist_t *props, nvlist_t *origprops)
 {
 	nvpair_t *cur, *following;
 
 	/* Without originals every property needs to be received. */
 	if (origprops == NULL)
 		return;
 
 	for (cur = nvlist_next_nvpair(props, NULL); cur != NULL;
 	    cur = following) {
 		nvpair_t *match;
 
 		/* Look ahead now, since 'cur' may be removed below. */
 		following = nvlist_next_nvpair(props, cur);
 
 		if (nvlist_lookup_nvpair(origprops, nvpair_name(cur),
 		    &match) == 0 && propval_equals(cur, match)) {
 			/* don't clear the existing received value */
 			(void) nvlist_remove_nvpair(origprops, match);
 			/* don't bother receiving the property */
 			(void) nvlist_remove_nvpair(props, cur);
 		}
 		/* Otherwise the received value still has to be set. */
 	}
 }
 
 /*
  * Extract properties that cannot be set PRIOR to the receipt of a dataset.
  * For example, refquota cannot be set until after the receipt of a dataset,
  * because in replication streams, an older/earlier snapshot may exceed the
  * refquota.  We want to receive the older/earlier snapshot, but setting
  * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
  * the older/earlier snapshot from being received (with EDQUOT).
  *
  * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
  *
  * libzfs will need to be judicious handling errors encountered by props
  * extracted by this function.
  *
  * The extracted pairs are moved out of props into a newly allocated nvlist,
  * which is returned; NULL is returned when nothing was extracted.
  */
 static nvlist_t *
 extract_delay_props(nvlist_t *props)
 {
 	static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
 	nvlist_t *delayed;
 	nvpair_t *cur, *prev;
 	int idx;
 
 	VERIFY(nvlist_alloc(&delayed, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	for (cur = nvlist_next_nvpair(props, NULL); cur != NULL;
 	    cur = nvlist_next_nvpair(props, cur)) {
 		boolean_t must_delay = B_FALSE;
 
 		/*
 		 * strcmp() is safe because zfs_prop_to_name() always returns
 		 * a bounded string.
 		 */
 		for (idx = 0; delayable[idx] != 0; idx++) {
 			if (strcmp(zfs_prop_to_name(delayable[idx]),
 			    nvpair_name(cur)) == 0) {
 				must_delay = B_TRUE;
 				break;
 			}
 		}
 		if (!must_delay)
 			continue;
 
 		/*
 		 * Step back to the predecessor before removing the current
 		 * pair, so the loop increment resumes from a pair that is
 		 * still on the list.
 		 */
 		prev = nvlist_prev_nvpair(props, cur);
 		VERIFY(nvlist_add_nvpair(delayed, cur) == 0);
 		VERIFY(nvlist_remove_nvpair(props, cur) == 0);
 		cur = prev;
 	}
 
 	if (nvlist_empty(delayed)) {
 		nvlist_free(delayed);
 		delayed = NULL;
 	}
 	return (delayed);
 }
 
 /*
  * DEBUG-only fault injection for zfs_ioc_recv(): when this flag is set,
  * zfs_ioc_recv() clears it and forces its own error result to 1 after the
  * stream has been processed, exercising the property-restore path.
  */
 #ifdef	DEBUG
 static boolean_t zfs_ioc_recv_inject_err;
 #endif
 
 /*
  * Receive a send stream from a file descriptor into the snapshot named by
  * zc_value, applying (and on failure restoring) received properties.
  *
  * inputs:
  * zc_name		name of containing filesystem
  * zc_nvlist_src{_size}	nvlist of properties to apply
  * zc_value		name of snapshot to create
  * zc_string		name of clone origin (if DRR_FLAG_CLONE)
  * zc_cookie		file descriptor to recv from
  * zc_begin_record	the BEGIN record of the stream (not byteswapped)
  * zc_guid		force flag
  * zc_cleanup_fd	cleanup-on-exit file descriptor
  * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
  * zc_resumable		if data is incomplete assume sender will resume
  *
  * outputs:
  * zc_cookie		number of bytes read
  * zc_nvlist_dst{_size} error for each unapplied received property
  * zc_obj		zprop_errflags_t
  * zc_action_handle	handle for this guid/ds mapping
  */
 static int
 zfs_ioc_recv(zfs_cmd_t *zc)
 {
 	file_t *fp;
 	dmu_recv_cookie_t drc;
 	boolean_t force = (boolean_t)zc->zc_guid;
 	int fd;
 	int error = 0;
 	int props_error = 0;
 	nvlist_t *errors;
 	offset_t off;
 	nvlist_t *props = NULL; /* sent properties */
 	nvlist_t *origprops = NULL; /* existing properties */
 	nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
 	char *origin = NULL;
 	char *tosnap;
 	char tofs[ZFS_MAXNAMELEN];
 	cap_rights_t rights;
 	boolean_t first_recvd_props = B_FALSE;
 
 	/* The target must be a valid snapshot name ("fs@snap", no '%'). */
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL ||
 	    strchr(zc->zc_value, '%'))
 		return (SET_ERROR(EINVAL));
 
 	/* Split "fs@snap" into tofs ("fs") and tosnap ("snap"). */
 	(void) strcpy(tofs, zc->zc_value);
 	tosnap = strchr(tofs, '@');
 	*tosnap++ = '\0';
 
 	if (zc->zc_nvlist_src != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    zc->zc_iflags, &props)) != 0)
 		return (error);
 
 	fd = zc->zc_cookie;
 #ifdef illumos
 	fp = getf(fd);
 #else
 	/*
 	 * NOTE(review): the return value is ignored and fp has no
 	 * initializer; the NULL check below presumably relies on
 	 * fget_read() storing NULL in fp on failure - confirm.
 	 */
 	fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
 #endif
 	if (fp == NULL) {
 		nvlist_free(props);
 		return (SET_ERROR(EBADF));
 	}
 
 	errors = fnvlist_alloc();
 
 	if (zc->zc_string[0])
 		origin = zc->zc_string;
 
 	error = dmu_recv_begin(tofs, tosnap,
 	    &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Set properties before we receive the stream so that they are applied
 	 * to the new data. Note that we must call dmu_recv_stream() if
 	 * dmu_recv_begin() succeeds.
 	 */
 	if (props != NULL && !drc.drc_newfs) {
 		if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
 		    SPA_VERSION_RECVD_PROPS &&
 		    !dsl_prop_get_hasrecvd(tofs))
 			first_recvd_props = B_TRUE;
 
 		/*
 		 * If new received properties are supplied, they are to
 		 * completely replace the existing received properties, so stash
 		 * away the existing ones.
 		 */
 		if (dsl_prop_get_received(tofs, &origprops) == 0) {
 			nvlist_t *errlist = NULL;
 			/*
 			 * Don't bother writing a property if its value won't
 			 * change (and avoid the unnecessary security checks).
 			 *
 			 * The first receive after SPA_VERSION_RECVD_PROPS is a
 			 * special case where we blow away all local properties
 			 * regardless.
 			 */
 			if (!first_recvd_props)
 				props_reduce(props, origprops);
 			if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
 				(void) nvlist_merge(errors, errlist, 0);
 			nvlist_free(errlist);
 
 			if (clear_received_props(tofs, origprops,
 			    first_recvd_props ? NULL : props) != 0)
 				zc->zc_obj |= ZPROP_ERR_NOCLEAR;
 		} else {
 			zc->zc_obj |= ZPROP_ERR_NOCLEAR;
 		}
 	}
 
 	if (props != NULL) {
 		props_error = dsl_prop_set_hasrecvd(tofs);
 
 		if (props_error == 0) {
 			/*
 			 * Properties that must wait until after the receive
 			 * are moved out of props into delayprops first.
 			 */
 			delayprops = extract_delay_props(props);
 			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
 			    props, errors);
 		}
 	}
 
 	off = fp->f_offset;
 	error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
 	    &zc->zc_action_handle);
 
 	if (error == 0) {
 		zfsvfs_t *zfsvfs = NULL;
 
 		if (getzfsvfs(tofs, &zfsvfs) == 0) {
 			/* online recv */
 			int end_err;
 
 			error = zfs_suspend_fs(zfsvfs);
 			/*
 			 * If the suspend fails, then the recv_end will
 			 * likely also fail, and clean up after itself.
 			 */
 			end_err = dmu_recv_end(&drc, zfsvfs);
 			if (error == 0)
 				error = zfs_resume_fs(zfsvfs, tofs);
 			/* A suspend/resume error wins over end_err. */
 			error = error ? error : end_err;
 			VFS_RELE(zfsvfs->z_vfs);
 		} else {
 			error = dmu_recv_end(&drc, NULL);
 		}
 
 		/* Set delayed properties now, after we're done receiving. */
 		if (delayprops != NULL && error == 0) {
 			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
 			    delayprops, errors);
 		}
 	}
 
 	if (delayprops != NULL) {
 		/*
 		 * Merge delayed props back in with initial props, in case
 		 * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
 		 * we have to make sure clear_received_props() includes
 		 * the delayed properties).
 		 *
 		 * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
 		 * using ASSERT() will be just like a VERIFY.
 		 */
 		ASSERT(nvlist_merge(props, delayprops, 0) == 0);
 		nvlist_free(delayprops);
 	}
 
 	/*
 	 * Now that all props, initial and delayed, are set, report the prop
 	 * errors to the caller.
 	 */
 	if (zc->zc_nvlist_dst_size != 0 &&
 	    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
 	    put_nvlist(zc, errors) != 0)) {
 		/*
 		 * Caller made zc->zc_nvlist_dst less than the minimum expected
 		 * size or supplied an invalid address.
 		 */
 		props_error = SET_ERROR(EINVAL);
 	}
 
 	/* Report bytes consumed, then advance the file offset if sane. */
 	zc->zc_cookie = off - fp->f_offset;
 	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
 
 #ifdef	DEBUG
 	if (zfs_ioc_recv_inject_err) {
 		zfs_ioc_recv_inject_err = B_FALSE;
 		error = 1;
 	}
 #endif
 
 #ifdef __FreeBSD__
 	if (error == 0)
 		zvol_create_minors(tofs);
 #endif
 
 	/*
 	 * On error, restore the original props.
 	 */
 	if (error != 0 && props != NULL && !drc.drc_newfs) {
 		if (clear_received_props(tofs, props, NULL) != 0) {
 			/*
 			 * We failed to clear the received properties.
 			 * Since we may have left a $recvd value on the
 			 * system, we can't clear the $hasrecvd flag.
 			 */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		} else if (first_recvd_props) {
 			dsl_prop_unset_hasrecvd(tofs);
 		}
 
 		if (origprops == NULL && !drc.drc_newfs) {
 			/* We failed to stash the original properties. */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		}
 
 		/*
 		 * dsl_props_set() will not convert RECEIVED to LOCAL on or
 		 * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
 		 * explicitly if we're restoring local properties cleared in the
 		 * first new-style receive.
 		 */
 		if (origprops != NULL &&
 		    zfs_set_prop_nvlist(tofs, (first_recvd_props ?
 		    ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
 		    origprops, NULL) != 0) {
 			/*
 			 * We stashed the original properties but failed to
 			 * restore them.
 			 */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
 		}
 	}
 out:
 	/* Common cleanup; also reached directly if dmu_recv_begin() fails. */
 	nvlist_free(props);
 	nvlist_free(origprops);
 	nvlist_free(errors);
 	releasef(fd);
 
 	/* A receive error takes precedence over a property error. */
 	if (error == 0)
 		error = props_error;
 
 	return (error);
 }
 
 /*
  * Legacy send ioctl: either write a send stream for a snapshot to a file
  * descriptor, or (when zc_guid is set) only estimate the stream size.
  *
  * inputs:
  * zc_name	name of snapshot to send
  * zc_cookie	file descriptor to send stream to
  * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
  * zc_sendobj	objsetid of snapshot to send
  * zc_fromobj	objsetid of incremental fromsnap (may be zero)
  * zc_guid	if set, estimate size of stream only.  zc_cookie is ignored.
  *		output size in zc_objset_type.
  * zc_flags	lzc_send_flags
  *
  * outputs:
  * zc_objset_type	estimated size, if zc_guid is set
  */
 static int
 zfs_ioc_send(zfs_cmd_t *zc)
 {
 	file_t *fp;
 	cap_rights_t rights;
 	offset_t off;
 	int error;
 	boolean_t estimate = (zc->zc_guid != 0);
 	/* Bits 0x1 and 0x2 of zc_flags select the two stream options. */
 	boolean_t embedok = (zc->zc_flags & 0x1);
 	boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
 	/*
 	 * fromorigin: if the snapshot's dsl_dir is a clone, overwrite
 	 * zc_fromobj with the origin object of that dir.
 	 */
 	if (zc->zc_obj != 0) {
 		dsl_pool_t *pool;
 		dsl_dataset_t *snap;
 
 		error = dsl_pool_hold(zc->zc_name, FTAG, &pool);
 		if (error != 0)
 			return (error);
 
 		error = dsl_dataset_hold_obj(pool, zc->zc_sendobj, FTAG,
 		    &snap);
 		if (error != 0) {
 			dsl_pool_rele(pool, FTAG);
 			return (error);
 		}
 
 		if (dsl_dir_is_clone(snap->ds_dir)) {
 			zc->zc_fromobj =
 			    dsl_dir_phys(snap->ds_dir)->dd_origin_obj;
 		}
 		dsl_dataset_rele(snap, FTAG);
 		dsl_pool_rele(pool, FTAG);
 	}
 
 	/* Estimate-only request: no file descriptor is touched. */
 	if (estimate) {
 		dsl_pool_t *dp;
 		dsl_dataset_t *tosnap;
 		dsl_dataset_t *fromsnap = NULL;
 
 		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 		if (error != 0)
 			return (error);
 
 		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
 		if (error == 0) {
 			if (zc->zc_fromobj != 0) {
 				error = dsl_dataset_hold_obj(dp,
 				    zc->zc_fromobj, FTAG, &fromsnap);
 			}
 			if (error == 0) {
 				error = dmu_send_estimate(tosnap, fromsnap,
 				    &zc->zc_objset_type);
 				if (fromsnap != NULL)
 					dsl_dataset_rele(fromsnap, FTAG);
 			}
 			dsl_dataset_rele(tosnap, FTAG);
 		}
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	/* Real send: resolve the output descriptor named by zc_cookie. */
 #ifdef illumos
 	fp = getf(zc->zc_cookie);
 #else
 	/* NOTE(review): return value unchecked; relies on fp being NULL on failure. */
 	fget_write(curthread, zc->zc_cookie,
 	    cap_rights_init(&rights, CAP_WRITE), &fp);
 #endif
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
 	off = fp->f_offset;
 #ifdef illumos
 	error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj,
 	    embedok, large_block_ok, zc->zc_cookie, fp->f_vnode, &off);
 #else
 	error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj,
 	    embedok, large_block_ok, zc->zc_cookie, fp, &off);
 #endif
 
 	/* Only store the new offset back if it is within the valid range. */
 	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
 	releasef(zc->zc_cookie);
 
 	return (error);
 }
 
 /*
  * Report how far an in-flight send stream on a dataset has progressed.
  *
  * inputs:
  * zc_name	name of snapshot on which to report progress
  * zc_cookie	file descriptor of send stream
  *
  * outputs:
  * zc_cookie	number of bytes written in send stream thus far
  *
  * Returns ENOENT if no matching stream is found.
  */
 static int
 zfs_ioc_send_progress(zfs_cmd_t *zc)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	dmu_sendarg_t *stream;
 	int error;
 
 	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
 	if (error == 0) {
 		mutex_enter(&ds->ds_sendstream_lock);
 
 		/*
 		 * Walk the send streams active on this dataset, stopping at
 		 * the first one that both writes to the given descriptor
 		 * and was started by the current process.
 		 */
 		stream = list_head(&ds->ds_sendstreams);
 		while (stream != NULL &&
 		    (stream->dsa_outfd != zc->zc_cookie ||
 		    stream->dsa_proc != curproc)) {
 			stream = list_next(&ds->ds_sendstreams, stream);
 		}
 
 		if (stream == NULL)
 			error = SET_ERROR(ENOENT);
 		else
 			zc->zc_cookie = *(stream->dsa_off);
 
 		mutex_exit(&ds->ds_sendstream_lock);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Pass an injection record to zio_inject_fault().  zc_guid (truncated to
  * int) is forwarded as the second argument; on success zc_guid is
  * overwritten with the id that zio_inject_fault() hands back.
  */
 static int
 zfs_ioc_inject_fault(zfs_cmd_t *zc)
 {
 	int handler_id;
 	int error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid,
 	    &handler_id, &zc->zc_inject_record);
 
 	if (error != 0)
 		return (error);
 
 	zc->zc_guid = (uint64_t)handler_id;
 	return (0);
 }
 
 /*
  * Clear the injected fault identified by zc_guid (truncated to int) and
  * return whatever zio_clear_fault() reports.
  */
 static int
 zfs_ioc_clear_fault(zfs_cmd_t *zc)
 {
 	int handler_id = (int)zc->zc_guid;
 
 	return (zio_clear_fault(handler_id));
 }
 
 /*
  * Step through the injection records: zc_guid is used as the in/out
  * cursor for zio_inject_list_next(), which also fills in zc_name and
  * zc_inject_record.  The cursor is written back to zc_guid whether or not
  * the call succeeded.
  */
 static int
 zfs_ioc_inject_list_next(zfs_cmd_t *zc)
 {
 	int cursor = (int)zc->zc_guid;
 	int error = zio_inject_list_next(&cursor, zc->zc_name,
 	    sizeof (zc->zc_name), &zc->zc_inject_record);
 
 	zc->zc_guid = cursor;
 
 	return (error);
 }
 
 /*
  * Copy the pool's error log into the buffer described by
  * zc_nvlist_dst/zc_nvlist_dst_size.
  *
  * On success zc_nvlist_dst_size is replaced by the count returned from
  * spa_get_errlog(); on failure it is replaced by spa_get_errlog_size().
  */
 static int
 zfs_ioc_error_log(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	size_t count = (size_t)zc->zc_nvlist_dst_size;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
 	    &count);
 	if (error != 0)
 		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
 	else
 		zc->zc_nvlist_dst_size = count;
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Clear vdev errors on a pool (or on one vdev) and resume suspended I/O.
  *
  * inputs (as read by this function):
  * zc_name		name of the pool
  * zc_cookie		ZPOOL_NO_REWIND selects a plain spa_open()
  * zc_nvlist_src[_size]	rewind policy nvlist, required unless
  *			ZPOOL_NO_REWIND is set
  * zc_guid		guid of the vdev to clear, or 0 to pass NULL to
  *			vdev_clear()
  *
  * outputs:
  * zc_nvlist_dst	config nvlist returned by spa_open_rewind(), if any
  */
 static int
 zfs_ioc_clear(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	vdev_t *vd;
 	int error;
 
 	/*
 	 * On zpool clear we also fix up missing slogs
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_lookup(zc->zc_name);
 	if (spa == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EIO));
 	}
 	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
 		/* we need to let spa_open/spa_load clear the chains */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	}
 	spa->spa_last_open_failed = 0;
 	mutex_exit(&spa_namespace_lock);
 
 	if (zc->zc_cookie & ZPOOL_NO_REWIND) {
 		error = spa_open(zc->zc_name, &spa, FTAG);
 	} else {
 		nvlist_t *policy;
 		nvlist_t *config = NULL;
 
 		if (zc->zc_nvlist_src == 0)
 			return (SET_ERROR(EINVAL));
 
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
 			error = spa_open_rewind(zc->zc_name, &spa, FTAG,
 			    policy, &config);
 			if (config != NULL) {
 				int err;
 
 				if ((err = put_nvlist(zc, config)) != 0) {
 					/*
 					 * The open itself may have succeeded
 					 * and taken a hold on the pool.  We
 					 * are about to return early, so drop
 					 * that hold here; otherwise it would
 					 * never be released.
 					 */
 					if (error == 0)
 						spa_close(spa, FTAG);
 					error = err;
 				}
 				nvlist_free(config);
 			}
 			nvlist_free(policy);
 		}
 	}
 
 	if (error != 0)
 		return (error);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if (zc->zc_guid == 0) {
 		vd = NULL;
 	} else {
 		vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
 		if (vd == NULL) {
 			(void) spa_vdev_state_exit(spa, NULL, ENODEV);
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ENODEV));
 		}
 	}
 
 	vdev_clear(spa, vd);
 
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	/*
 	 * Resume any suspended I/Os.
 	 */
 	if (zio_resume(spa) != 0)
 		error = SET_ERROR(EIO);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Reopen all vdevs of the pool named by zc_name.  Returns the spa_open()
  * error if the pool cannot be opened, otherwise 0.
  */
 static int
 zfs_ioc_pool_reopen(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error = spa_open(zc->zc_name, &spa, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	/*
 	 * If a resilver is already in progress then set the
 	 * spa_scrub_reopen flag to B_TRUE so that we don't restart
 	 * the scan as a side effect of the reopen. Otherwise, let
 	 * vdev_open() decide if a resilver is required.  The flag is
 	 * only meaningful for the duration of the vdev_reopen() call
 	 * and is reset immediately afterwards.
 	 */
 	spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
 /*
  * Promote a clone.
  *
  * inputs:
  * zc_name	name of filesystem
  * zc_value	name of origin snapshot
  *
  * outputs:
  * zc_string	name of conflicting snapshot, if there is one
  *
  * Note that zc_value is truncated in place at its first '@'.
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
 {
 	char *at = strchr(zc->zc_value, '@');
 
 	/*
 	 * We don't need to unmount *all* the origin fs's snapshots, but
 	 * it's easier: strip the snapshot part and visit every snapshot
 	 * under the origin filesystem.  The result of the walk is
 	 * deliberately ignored.
 	 */
 	if (at != NULL)
 		*at = '\0';
 
 	(void) dmu_objset_find(zc->zc_value, zfs_unmount_snap_cb, NULL,
 	    DS_FIND_SNAPSHOTS);
 
 	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
 /*
  * Retrieve a single {user|group}{used|quota}@... property.
  *
  * inputs:
  * zc_name	name of filesystem
  * zc_objset_type zfs_userquota_prop_t
  * zc_value	domain name (eg. "S-1-234-567-89")
  * zc_guid	RID/UID/GID
  *
  * outputs:
  * zc_cookie	property value
  *
  * Returns EINVAL if zc_objset_type is not below ZFS_NUM_USERQUOTA_PROPS.
  */
 static int
 zfs_ioc_userspace_one(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
 		return (SET_ERROR(EINVAL));
 
 	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
 	if (error == 0) {
 		error = zfs_userspace_one(zfsvfs, zc->zc_objset_type,
 		    zc->zc_value, zc->zc_guid, &zc->zc_cookie);
 		zfsvfs_rele(zfsvfs, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Retrieve a batch of user/group accounting records.
  *
  * inputs:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_objset_type	zfs_userquota_prop_t
  * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
  *
  * outputs:
  * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
  * zc_cookie	zap cursor
  *
  * Returns ENOMEM if the requested buffer size is zero or cannot be
  * represented as a positive int.
  */
 static int
 zfs_ioc_userspace_many(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
 	int bufsize = zc->zc_nvlist_dst_size;
 
 	/*
 	 * The kernel buffer is allocated with the int-narrowed size, but
 	 * zfs_userspace_many() is handed a pointer to the original
 	 * zc_nvlist_dst_size field.  Reject any size that does not survive
 	 * the narrowing unchanged, so the two can never disagree and the
 	 * buffer is never smaller than the size passed alongside it.
 	 */
 	if (bufsize <= 0 || (uint64_t)bufsize != zc->zc_nvlist_dst_size)
 		return (SET_ERROR(ENOMEM));
 
 	int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	void *buf = kmem_alloc(bufsize, KM_SLEEP);
 
 	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
 	    buf, &zc->zc_nvlist_dst_size);
 
 	/* zc_nvlist_dst_size may have been updated by the call above. */
 	if (error == 0) {
 		error = ddi_copyout(buf,
 		    (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    zc->zc_nvlist_dst_size, zc->zc_iflags);
 	}
 	/* Free with the size that was allocated, not the updated one. */
 	kmem_free(buf, bufsize);
 	zfsvfs_rele(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Run dmu_objset_userspace_upgrade() on a filesystem's objset.
  *
  * inputs:
  * zc_name		name of filesystem
  *
  * outputs:
  * none
  */
 static int
 zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 
 	if (getzfsvfs(zc->zc_name, &zfsvfs) != 0) {
 		/*
 		 * No zfsvfs could be obtained for this name; work on a
 		 * plain objset hold instead.
 		 */
 		/* XXX kind of reading contents without owning */
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 		if (error != 0)
 			return (error);
 
 		error = dmu_objset_userspace_upgrade(os);
 		dmu_objset_rele(os, FTAG);
 		return (error);
 	}
 
 	if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
 		/*
 		 * If userused is not enabled, it may be because the
 		 * objset needs to be closed & reopened (to grow the
 		 * objset_phys_t).  Suspend/resume the fs will do that.
 		 */
 		error = zfs_suspend_fs(zfsvfs);
 		if (error == 0) {
 			dmu_objset_refresh_ownership(zfsvfs->z_os, zfsvfs);
 			error = zfs_resume_fs(zfsvfs, zc->zc_name);
 		}
 	}
 
 	/* Skip the upgrade if the suspend/resume cycle failed. */
 	if (error == 0)
 		error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
 	VFS_RELE(zfsvfs->z_vfs);
 
 	return (error);
 }
 
 #ifdef illumos
 /*
  * We don't want to have a hard dependency against some special symbols
  * in sharefs, nfs, and smbsrv.  Determine them if needed when the first
  * file system is shared.
  * None of sharefs, nfs or smbsrv are unloadable modules.
  */
 /* Entry points resolved with ddi_modsym(); NULL until first use. */
 int (*znfsexport_fs)(void *arg);
 int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
 int (*zsmbexport_fs)(void *arg, boolean_t add_share);
 
 /* Set to 1 by zfs_ioc_share() once the NFS / SMB symbols are resolved. */
 int zfs_nfsshare_inited;
 int zfs_smbshare_inited;
 
 /* Module handles returned by ddi_modopen(). */
 ddi_modhandle_t nfs_mod;
 ddi_modhandle_t sharefs_mod;
 ddi_modhandle_t smbsrv_mod;
 #endif	/* illumos */
 /*
  * Held by zfs_ioc_share() while it opens the share modules and resolves
  * their symbols.  Declared on all platforms, though the code in this
  * part of the file only takes it under illumos.
  */
 kmutex_t zfs_share_lock;
 
 #ifdef illumos
 /*
  * Make sure the sharefs module is open and its "sharefs_impl" entry point
  * is resolved into zshare_fs.  Work already done by an earlier call is
  * skipped.  The caller must hold zfs_share_lock.
  *
  * Returns 0 on success, or ENOSYS if the module or symbol is unavailable.
  */
 static int
 zfs_init_sharefs()
 {
 	int error;
 
 	ASSERT(MUTEX_HELD(&zfs_share_lock));
 
 	/* Both NFS and SMB shares also require sharetab support. */
 	if (sharefs_mod == NULL) {
 		sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST,
 		    &error);
 		if (sharefs_mod == NULL)
 			return (SET_ERROR(ENOSYS));
 	}
 
 	if (zshare_fs == NULL) {
 		zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
 		    ddi_modsym(sharefs_mod, "sharefs_impl", &error);
 		if (zshare_fs == NULL)
 			return (SET_ERROR(ENOSYS));
 	}
 
 	return (0);
 }
 #endif	/* illumos */
 
 /*
  * Share or unshare a filesystem over NFS or SMB.
  *
  * On illumos: lazily loads the protocol module and sharefs on first use,
  * calls the protocol's export entry point with zc_share.z_exportdata, and
  * then adds or removes the entry in the sharetab via zshare_fs().
  * Returns EINVAL for an unknown zc_share.z_sharetype and ENOSYS if a
  * required module or symbol cannot be loaded.
  *
  * On every other platform this always returns ENOSYS.
  */
 static int
 zfs_ioc_share(zfs_cmd_t *zc)
 {
 #ifdef illumos
 	int error;
 	int opcode;
 
 	/*
 	 * Phase 1: one-time initialization per protocol.  The *_inited
 	 * flag is tested before zfs_share_lock is taken; the individual
 	 * handles are re-checked for NULL under the lock.
 	 */
 	switch (zc->zc_share.z_sharetype) {
 	case ZFS_SHARE_NFS:
 	case ZFS_UNSHARE_NFS:
 		if (zfs_nfsshare_inited == 0) {
 			mutex_enter(&zfs_share_lock);
 			if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			if (znfsexport_fs == NULL &&
 			    ((znfsexport_fs = (int (*)(void *))
 			    ddi_modsym(nfs_mod,
 			    "nfs_export", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
 			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			zfs_nfsshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
 		}
 		break;
 	case ZFS_SHARE_SMB:
 	case ZFS_UNSHARE_SMB:
 		if (zfs_smbshare_inited == 0) {
 			mutex_enter(&zfs_share_lock);
 			if (smbsrv_mod == NULL && ((smbsrv_mod =
 			    ddi_modopen("drv/smbsrv",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			if (zsmbexport_fs == NULL && ((zsmbexport_fs =
 			    (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
 			    "smb_server_share", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
 			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
 				return (SET_ERROR(ENOSYS));
 			}
 			zfs_smbshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
 		}
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Phase 2: hand the export data to the protocol module.  A non-zero
 	 * result is returned as-is and the sharetab is left untouched.
 	 */
 	switch (zc->zc_share.z_sharetype) {
 	case ZFS_SHARE_NFS:
 	case ZFS_UNSHARE_NFS:
 		if (error =
 		    znfsexport_fs((void *)
 		    (uintptr_t)zc->zc_share.z_exportdata))
 			return (error);
 		break;
 	case ZFS_SHARE_SMB:
 	case ZFS_UNSHARE_SMB:
 		/* The second argument tells smbsrv whether this is an add. */
 		if (error = zsmbexport_fs((void *)
 		    (uintptr_t)zc->zc_share.z_exportdata,
 		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
 		    B_TRUE: B_FALSE)) {
 			return (error);
 		}
 		break;
 	}
 
 	/* Phase 3: share requests add to the sharetab, unshare removes. */
 	opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
 	    zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
 	    SHAREFS_ADD : SHAREFS_REMOVE;
 
 	/*
 	 * Add or remove share from sharetab
 	 */
 	error = zshare_fs(opcode,
 	    (void *)(uintptr_t)zc->zc_share.z_sharedata,
 	    zc->zc_share.z_sharemax);
 
 	return (error);
 
 #else	/* !illumos */
 	return (ENOSYS);
 #endif	/* illumos */
 }
 
 /*
  * Single-entry ACL that zfs_ioc_smb_acl() attaches (as vsa_aclentp) to the
  * files it creates for ZFS_SMB_ACL_ADD.  Presumably grants every
  * permission (ACE_ALL_PERMS) to everyone (ACE_EVERYONE) -- confirm the
  * field order against the ace_t definition.
  */
 ace_t full_access[] = {
 	{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
 };
 
 /*
  * Find the next in-use object in an objset.
  *
  * inputs:
  * zc_name		name of containing filesystem
  * zc_obj		object # beyond which we want next in-use object #
  *
  * outputs:
  * zc_obj		next in-use object #
  */
 static int
 zfs_ioc_next_obj(zfs_cmd_t *zc)
 {
 	objset_t *os = NULL;
 	int error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 
 	if (error == 0) {
 		/*
 		 * The dataset's ds_prev_snap_txg is handed to
 		 * dmu_object_next() as its final (txg) argument.
 		 */
 		error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
 		    dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
 		dmu_objset_rele(os, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Create a temporary snapshot whose lifetime is tied to a cleanup fd.
  *
  * inputs:
  * zc_name		name of filesystem
  * zc_value		prefix name for snapshot
  * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
  *
  * outputs:
  * zc_value		short name of new snapshot
  */
 static int
 zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
 {
 	char *snap_name;
 	char *hold_name;
 	int error;
 	minor_t minor;
 
 	error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Snapshot name is "<prefix>-<lbolt as 16 hex digits>"; the hold
 	 * tag is the prefix with a leading '%'.
 	 */
 	snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
 	    (u_longlong_t)ddi_get_lbolt64());
 	hold_name = kmem_asprintf("%%%s", zc->zc_value);
 
 	error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
 	    hold_name);
 	if (error == 0) {
 		/*
 		 * snap_name is always longer than the prefix it was built
 		 * from, so bound the copy back into the fixed-size
 		 * zc_value field instead of relying on earlier name
 		 * validation to keep it short enough.
 		 */
 		(void) strlcpy(zc->zc_value, snap_name,
 		    sizeof (zc->zc_value));
 	}
 	strfree(snap_name);
 	strfree(hold_name);
 	zfs_onexit_fd_rele(zc->zc_cleanup_fd);
 	return (error);
 }
 
 /*
  * Write the differences between two snapshots to a file descriptor.
  *
  * inputs:
  * zc_name		name of "to" snapshot
  * zc_value		name of "from" snapshot
  * zc_cookie		file descriptor to write diff data on
  *
  * outputs:
  * dmu_diff_record_t's to the file descriptor
  *
  * Returns EBADF if zc_cookie does not resolve to a file.
  */
 static int
 zfs_ioc_diff(zfs_cmd_t *zc)
 {
 	file_t *outfp;
 	cap_rights_t rights;
 	offset_t pos;
 	int error;
 
 #ifdef illumos
 	outfp = getf(zc->zc_cookie);
 #else
 	/* NOTE(review): return value unchecked; relies on outfp being NULL on failure. */
 	fget_write(curthread, zc->zc_cookie,
 	    cap_rights_init(&rights, CAP_WRITE), &outfp);
 #endif
 	if (outfp == NULL)
 		return (SET_ERROR(EBADF));
 
 	/* dmu_diff() updates pos through the pointer it is given. */
 	pos = outfp->f_offset;
 
 #ifdef illumos
 	error = dmu_diff(zc->zc_name, zc->zc_value, outfp->f_vnode, &pos);
 #else
 	error = dmu_diff(zc->zc_name, zc->zc_value, outfp, &pos);
 #endif
 
 	/* Only store the new offset back if it is within the valid range. */
 	if (pos >= 0 && pos <= MAXOFFSET_T)
 		outfp->f_offset = pos;
 	releasef(zc->zc_cookie);
 
 	return (error);
 }
 
 #ifdef illumos
 /*
  * Remove all ACL files in shares dir
  *
  * Walks the ZAP entries of the directory znode and calls VOP_REMOVE() on
  * each entry name.  The walk stops at the first cursor or removal error,
  * and that error is the return value.
  */
 static int
 zfs_smb_acl_purge(znode_t *dzp)
 {
 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	zap_cursor_t	cursor;
 	zap_attribute_t	entry;
 	int error;
 
 	zap_cursor_init(&cursor, zfsvfs->z_os, dzp->z_id);
 	while ((error = zap_cursor_retrieve(&cursor, &entry)) == 0) {
 		error = VOP_REMOVE(ZTOV(dzp), entry.za_name, kcred, NULL, 0);
 		if (error != 0)
 			break;
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	return (error);
 }
 #endif	/* illumos */
 
 /*
  * Manage the per-share ACL files kept in a filesystem's shares directory.
  *
  * inputs (illumos; as read by this function):
  * zc_name		dataset name, must match the resource mounted at
  *			zc_value
  * zc_value		path of the mountpoint
  * zc_cookie		ZFS_SMB_ACL_ADD, _REMOVE, _RENAME or _PURGE
  * zc_string		file name for ADD and REMOVE
  * zc_nvlist_src[_size]	for RENAME: nvlist holding ZFS_SMB_ACL_SRC and
  *			ZFS_SMB_ACL_TARGET strings
  *
  * Returns EINVAL for a non-ZFS or mismatched mountpoint, an unknown
  * operation, or a RENAME request missing either name.  On every other
  * platform this always returns EOPNOTSUPP.
  */
 static int
 zfs_ioc_smb_acl(zfs_cmd_t *zc)
 {
 #ifdef illumos
 	vnode_t *vp;
 	znode_t *dzp;
 	vnode_t *resourcevp = NULL;
 	znode_t *sharedir;
 	zfsvfs_t *zfsvfs;
 	nvlist_t *nvlist;
 	char *src, *target;
 	vattr_t vattr;
 	vsecattr_t vsec;
 	int error = 0;
 
 	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
 	    NO_FOLLOW, NULL, &vp)) != 0)
 		return (error);
 
 	/* Now make sure mntpnt and dataset are ZFS */
 
 	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
 	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) != 0)) {
 		VN_RELE(vp);
 		return (SET_ERROR(EINVAL));
 	}
 
 	dzp = VTOZ(vp);
 	zfsvfs = dzp->z_zfsvfs;
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Create share dir if its missing.
 	 */
 	mutex_enter(&zfsvfs->z_lock);
 	if (zfsvfs->z_shares_dir == 0) {
 		dmu_tx_t *tx;
 
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
 		    ZFS_SHARES_DIR);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			error = zfs_create_share_dir(zfsvfs, tx);
 			dmu_tx_commit(tx);
 		}
 		if (error != 0) {
 			mutex_exit(&zfsvfs->z_lock);
 			VN_RELE(vp);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	ASSERT(zfsvfs->z_shares_dir);
 	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
 		VN_RELE(vp);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	switch (zc->zc_cookie) {
 	case ZFS_SMB_ACL_ADD:
 		/* Create a root-owned regular file carrying full_access. */
 		vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 		vattr.va_type = VREG;
 		vattr.va_mode = S_IFREG|0777;
 		vattr.va_uid = 0;
 		vattr.va_gid = 0;
 
 		vsec.vsa_mask = VSA_ACE;
 		vsec.vsa_aclentp = &full_access;
 		vsec.vsa_aclentsz = sizeof (full_access);
 		vsec.vsa_aclcnt = 1;
 
 		error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
 		    &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
 		if (resourcevp)
 			VN_RELE(resourcevp);
 		break;
 
 	case ZFS_SMB_ACL_REMOVE:
 		error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
 		    NULL, 0);
 		break;
 
 	case ZFS_SMB_ACL_RENAME:
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
 			VN_RELE(vp);
 			VN_RELE(ZTOV(sharedir));
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
 		    nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
 		    &target)) {
 			VN_RELE(vp);
 			VN_RELE(ZTOV(sharedir));
 			ZFS_EXIT(zfsvfs);
 			nvlist_free(nvlist);
 			/*
 			 * "error" still holds 0 from the successful
 			 * get_nvlist() above, so report the malformed
 			 * request explicitly rather than returning success.
 			 */
 			return (SET_ERROR(EINVAL));
 		}
 		error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
 		    kcred, NULL, 0);
 		nvlist_free(nvlist);
 		break;
 
 	case ZFS_SMB_ACL_PURGE:
 		error = zfs_smb_acl_purge(sharedir);
 		break;
 
 	default:
 		error = SET_ERROR(EINVAL);
 		break;
 	}
 
 	VN_RELE(vp);
 	VN_RELE(ZTOV(sharedir));
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 #else	/* !illumos */
 	return (EOPNOTSUPP);
 #endif	/* illumos */
 }
 
 /*
  * Place user holds on a set of snapshots.
  *
  * innvl: {
  *     "holds" -> { snapname -> holdname (string), ... }
  *     (optional) "cleanup_fd" -> fd (int32)
  * }
  *
  * outnvl: {
  *     snapname -> error value (int32)
  *     ...
  * }
  *
  * Returns EINVAL if "holds" is missing or any hold tag is empty.
  */
 /* ARGSUSED */
 static int
 zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 {
 	nvpair_t *pair = NULL;
 	nvlist_t *holds;
 	int cleanup_fd = -1;
 	int error;
 	minor_t minor = 0;
 
 	if (nvlist_lookup_nvlist(args, "holds", &holds) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* make sure the user didn't pass us any invalid (empty) tags */
 	while ((pair = nvlist_next_nvpair(holds, pair)) != NULL) {
 		char *htag;
 
 		error = nvpair_value_string(pair, &htag);
 		if (error != 0)
 			return (SET_ERROR(error));
 
 		if (htag[0] == '\0')
 			return (SET_ERROR(EINVAL));
 	}
 
 	/* minor stays 0 when no cleanup_fd was supplied. */
 	if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
 		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_dataset_user_hold(holds, minor, errlist);
 
 	/* Release the fd hold only if a non-zero minor was obtained. */
 	if (minor != 0)
 		zfs_onexit_fd_rele(cleanup_fd);
 
 	return (error);
 }
 
 /*
  * List the user holds on a snapshot.
  *
  * innvl is not used.
  *
  * outnvl: {
  *    holdname -> time added (uint64 seconds since epoch)
  *    ...
  * }
  */
 /* ARGSUSED */
 static int
 zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
 {
 	int error = dsl_dataset_get_holds(snapname, outnvl);
 
 	return (error);
 }
 
 /*
  * Release user holds from a set of snapshots.  The pool argument is not
  * used here.
  *
  * innvl: {
  *     snapname -> { holdname, ... }
  *     ...
  * }
  *
  * outnvl: {
  *     snapname -> error value (int32)
  *     ...
  * }
  */
 /* ARGSUSED */
 static int
 zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
 {
 	int error = dsl_dataset_user_release(holds, errlist);
 
 	return (error);
 }
 
 /*
  * Report the space written between an old snapshot and a newer dataset.
  *
  * inputs:
  * zc_name		name of new filesystem or snapshot
  * zc_value		full name of old snapshot
  *
  * outputs:
  * zc_cookie		space in bytes
  * zc_objset_type	compressed space in bytes
  * zc_perm_action	uncompressed space in bytes
  */
 static int
 zfs_ioc_space_written(zfs_cmd_t *zc)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *new, *old;
 	int error;
 
 	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	/* Holds are taken new-then-old and dropped in reverse order. */
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
 	if (error == 0) {
 		error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
 		if (error == 0) {
 			error = dsl_dataset_space_written(old, new,
 			    &zc->zc_cookie, &zc->zc_objset_type,
 			    &zc->zc_perm_action);
 			dsl_dataset_rele(old, FTAG);
 		}
 		dsl_dataset_rele(new, FTAG);
 	}
 	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
 
 /*
  * Report the space accounted to the range of snapshots from "firstsnap"
  * to lastsnap, as computed by dsl_dataset_space_wouldfree().
  *
  * innvl: {
  *     "firstsnap" -> snapshot name
  * }
  *
  * outnvl: {
  *     "used" -> space in bytes
  *     "compressed" -> compressed space in bytes
  *     "uncompressed" -> uncompressed space in bytes
  * }
  *
  * Returns EINVAL if "firstsnap" is missing or either name does not refer
  * to a snapshot.
  */
 static int
 zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	int error;
 	dsl_pool_t *dp;
 	dsl_dataset_t *new, *old;
 	char *firstsnap;
 	/*
 	 * The three results are added to outnvl even when the computation
 	 * below reports an error, so start them at zero rather than leaving
 	 * them indeterminate in case the callee fails before storing them.
 	 */
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
 		return (SET_ERROR(EINVAL));
 
 	error = dsl_pool_hold(lastsnap, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
 	if (error == 0 && !new->ds_is_snapshot) {
 		/* Held successfully, but it is not a snapshot. */
 		dsl_dataset_rele(new, FTAG);
 		error = SET_ERROR(EINVAL);
 	}
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 	error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
 	if (error == 0 && !old->ds_is_snapshot) {
 		dsl_dataset_rele(old, FTAG);
 		error = SET_ERROR(EINVAL);
 	}
 	if (error != 0) {
 		dsl_dataset_rele(new, FTAG);
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
 	dsl_dataset_rele(old, FTAG);
 	dsl_dataset_rele(new, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	fnvlist_add_uint64(outnvl, "used", used);
 	fnvlist_add_uint64(outnvl, "compressed", comp);
 	fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
 	return (error);
 }
 
 /*
  * Attach the dataset zc_name to the jail zc_jailid (truncated to int),
  * using the calling thread's credentials.
  */
 static int
 zfs_ioc_jail(zfs_cmd_t *zc)
 {
 	int jailid = (int)zc->zc_jailid;
 
 	return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
 	    jailid));
 }
 
 /*
  * Detach the dataset zc_name from the jail zc_jailid (truncated to int),
  * using the calling thread's credentials.
  */
 static int
 zfs_ioc_unjail(zfs_cmd_t *zc)
 {
 	int jailid = (int)zc->zc_jailid;
 
 	return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
 	    jailid));
 }
 
 /*
  * Write a send stream for a snapshot to a file descriptor.
  *
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
  *     (optional) "largeblockok" -> (value ignored)
  *         indicates that blocks > 128KB are permitted
  *     (optional) "embedok" -> (value ignored)
  *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  *     (optional) "resume_object" and "resume_offset" -> (uint64)
  *         if present, resume send stream from specified object and offset.
  * }
  *
  * outnvl is unused
  *
  * Returns EINVAL if "fd" is missing and EBADF if it does not resolve to
  * a file.
  */
 /* ARGSUSED */
 static int
 zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	cap_rights_t rights;
 	file_t *fp;
 	int error;
 	offset_t off;
 	char *fromname = NULL;
 	int fd;
 	boolean_t largeblockok;
 	boolean_t embedok;
 	uint64_t resumeobj = 0;
 	uint64_t resumeoff = 0;
 
 	error = nvlist_lookup_int32(innvl, "fd", &fd);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* Optional arguments keep their defaults when absent. */
 	(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
 	largeblockok = nvlist_exists(innvl, "largeblockok");
 	embedok = nvlist_exists(innvl, "embedok");
 
 	(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
 	(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
 
 #ifdef illumos
 	/*
 	 * Assign to the fp declared above; declaring it again here would
 	 * be a redefinition in the same scope.
 	 */
 	fp = getf(fd);
 #else
 	/* NOTE(review): return value unchecked; relies on fp being NULL on failure. */
 	fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
 #endif
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
 	off = fp->f_offset;
 	error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
 #ifdef illumos
 	    resumeobj, resumeoff, fp->f_vnode, &off);
 #else
 	    resumeobj, resumeoff, fp, &off);
 #endif
 
 #ifdef illumos
 	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
 		fp->f_offset = off;
 #else
 	fp->f_offset = off;
 #endif
 
 	releasef(fd);
 	return (error);
 }
 
 /*
  * Determine approximately how large a zfs send stream will be -- the number
  * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
  *
  * innvl: {
  *     (optional) "from" -> full snap or bookmark name to send an incremental
  *                          from
  * }
  *
  * outnvl: {
  *     "space" -> bytes of space (uint64)
  * }
  *
  * Returns EINVAL if "from" contains neither '@' nor '#'.
  */
 static int
 zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *tosnap;
 	int error;
 	char *fromname;
 	/*
 	 * "space" is added to outnvl whenever an estimator was called, even
 	 * if it returned an error, so start it at zero rather than leaving
 	 * it indeterminate in case the estimator fails before storing it.
 	 */
 	uint64_t space = 0;
 
 	error = dsl_pool_hold(snapname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
 	error = nvlist_lookup_string(innvl, "from", &fromname);
 	if (error == 0) {
 		if (strchr(fromname, '@') != NULL) {
 			/*
 			 * If from is a snapshot, hold it and use the more
 			 * efficient dmu_send_estimate to estimate send space
 			 * size using deadlists.
 			 */
 			dsl_dataset_t *fromsnap;
 			error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
 			if (error != 0)
 				goto out;
 			error = dmu_send_estimate(tosnap, fromsnap, &space);
 			dsl_dataset_rele(fromsnap, FTAG);
 		} else if (strchr(fromname, '#') != NULL) {
 			/*
 			 * If from is a bookmark, fetch the creation TXG of the
 			 * snapshot it was created from and use that to find
 			 * blocks that were born after it.
 			 */
 			zfs_bookmark_phys_t frombm;
 
 			error = dsl_bookmark_lookup(dp, fromname, tosnap,
 			    &frombm);
 			if (error != 0)
 				goto out;
 			error = dmu_send_estimate_from_txg(tosnap,
 			    frombm.zbm_creation_txg, &space);
 		} else {
 			/*
 			 * from is not properly formatted as a snapshot or
 			 * bookmark
 			 */
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 	} else {
 		/* If estimating the size of a full send, use dmu_send_estimate */
 		error = dmu_send_estimate(tosnap, NULL, &space);
 	}
 
 	fnvlist_add_uint64(outnvl, "space", space);
 
 out:
 	dsl_dataset_rele(tosnap, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
  * Table of ioctl descriptors, indexed by (ioc - ZFS_IOC_FIRST).  Entries
  * are filled in by zfs_ioctl_register() and zfs_ioctl_register_legacy().
  */
 static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
 
 /*
  * Fill in the zfs_ioc_vec entry for "ioc" with a legacy-style handler
  * (stored in zvec_legacy_func) and its policy, name-check, history-log
  * and pool-check settings.  Debug builds assert that ioc is in range and
  * that the entry has no handler of either kind yet.
  */
 static void
 zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
     boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
 {
 	zfs_ioc_vec_t *slot;
 
 	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
 	ASSERT3U(ioc, <, ZFS_IOC_LAST);
 
 	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
 	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
 	ASSERT3P(slot->zvec_func, ==, NULL);
 
 	slot->zvec_legacy_func = func;
 	slot->zvec_secpolicy = secpolicy;
 	slot->zvec_namecheck = namecheck;
 	slot->zvec_pool_check = pool_check;
 	slot->zvec_allow_log = log_history;
 }
 
 /*
  * Fill in the zfs_ioc_vec entry for "ioc" with a new-style handler
  * (stored in zvec_func).
  *
  * See the block comment at the beginning of this file for details on
  * each argument to this function.
  */
 static void
 zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
     zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
     boolean_t allow_log)
 {
 	zfs_ioc_vec_t *slot;
 
 	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
 	ASSERT3U(ioc, <, ZFS_IOC_LAST);
 
 	/* The entry must not already have a handler of either kind. */
 	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
 	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
 	ASSERT3P(slot->zvec_func, ==, NULL);
 
 	/* if we are logging, the name must be valid */
 	ASSERT(!allow_log || namecheck != NO_NAME);
 
 	slot->zvec_name = name;
 	slot->zvec_func = func;
 	slot->zvec_secpolicy = secpolicy;
 	slot->zvec_namecheck = namecheck;
 	slot->zvec_pool_check = pool_check;
 	slot->zvec_allow_log = allow_log;
 	slot->zvec_smush_outnvlist = smush_outnvlist;
 }
 
 /*
  * Register a legacy handler whose name argument is checked as a pool
  * name (POOL_NAME); the remaining settings come from the caller.
  */
 static void
 zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
     zfs_ioc_poolcheck_t pool_check)
 {
 	zfs_ioc_namecheck_t namecheck = POOL_NAME;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
 	    log_history, pool_check);
 }
 
 /*
  * Register a legacy handler that takes a dataset name (DATASET_NAME) and
  * is registered with history logging disabled (B_FALSE).
  */
 static void
 zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
 {
 	boolean_t log_history = B_FALSE;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
 	    log_history, pool_check);
 }
 
 /*
  * Register a legacy pool-name handler using zfs_secpolicy_config, with
  * history logging enabled and both the SUSPENDED and READONLY pool
  * checks requested.
  */
 static void
 zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
 	zfs_ioc_poolcheck_t checks =
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY;
 
 	zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
 	    POOL_NAME, B_TRUE, checks);
 }
 
 /*
  * Register a legacy handler with no name check (NO_NAME), history
  * logging disabled and no pool checks (POOL_CHECK_NONE).
  */
 static void
 zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy)
 {
 	zfs_ioc_poolcheck_t checks = POOL_CHECK_NONE;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE,
 	    checks);
 }
 
 /*
  * Register a legacy dataset-name handler with a caller-supplied security
  * policy, history logging disabled and only the SUSPENDED pool check.
  */
 static void
 zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
     zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
 {
 	zfs_ioc_poolcheck_t checks = POOL_CHECK_SUSPENDED;
 
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
 	    B_FALSE, checks);
 }
 
 /*
  * Same as zfs_ioctl_register_dataset_read_secpolicy(), with the security
  * policy fixed to zfs_secpolicy_read.
  */
 static void
 zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
 	zfs_secpolicy_func_t *secpolicy = zfs_secpolicy_read;
 
 	zfs_ioctl_register_dataset_read_secpolicy(ioc, func, secpolicy);
 }
 
 /*
  * Register a legacy-signature ioctl that modifies a dataset:
  * DATASET_NAME name check, history logging on, and both the suspended and
  * read-only pool checks requested.
  */
 static void
 zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
     zfs_secpolicy_func_t *secpolicy)
 {
 	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
 	    B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 }
 
 /*
  * Register every ioctl handler.
  *
  * Handlers with the nvlist-based signature are registered directly with
  * zfs_ioctl_register(), whose arguments are, in order: the name recorded
  * for history logging, the ioctl number, the handler, the security policy,
  * the name-check type, the pool status checks, whether the output nvlist
  * may be smushed to fit the caller's buffer, and whether the call is
  * logged.  Handlers with the legacy (zfs_cmd_t) signature are registered
  * through zfs_ioctl_register_legacy() or one of its convenience wrappers.
  */
 static void
 zfs_ioctl_init(void)
 {
 	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
 	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
 	    zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
 	    zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
 	    zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
 	    zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("create", ZFS_IOC_CREATE,
 	    zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("clone", ZFS_IOC_CLONE,
 	    zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
 	    zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("hold", ZFS_IOC_HOLD,
 	    zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 	zfs_ioctl_register("release", ZFS_IOC_RELEASE,
 	    zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
 	    zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
 	    zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
 
 	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
 	    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
 	    zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
 	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
 	    zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
 	    POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
 	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
 
 	/* Pool-modifying ioctls: logged, refused on suspended/read-only pools. */
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
 	    zfs_ioc_pool_scan);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
 	    zfs_ioc_pool_upgrade);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
 	    zfs_ioc_vdev_add);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
 	    zfs_ioc_vdev_remove);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
 	    zfs_ioc_vdev_set_state);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
 	    zfs_ioc_vdev_attach);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
 	    zfs_ioc_vdev_detach);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
 	    zfs_ioc_vdev_setpath);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
 	    zfs_ioc_vdev_setfru);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
 	    zfs_ioc_pool_set_props);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
 	    zfs_ioc_vdev_split);
 	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
 	    zfs_ioc_pool_reguid);
 
 	/* Ioctls that take no pool or dataset name. */
 	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
 	    zfs_ioc_pool_configs, zfs_secpolicy_none);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
 	    zfs_ioc_pool_tryimport, zfs_secpolicy_config);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
 	    zfs_ioc_inject_fault, zfs_secpolicy_inject);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
 	    zfs_ioc_clear_fault, zfs_secpolicy_inject);
 	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
 	    zfs_ioc_inject_list_next, zfs_secpolicy_inject);
 
 	/*
 	 * Pool destroy and export don't log the history as part of
 	 * zfsdev_ioctl(); instead zfs_ioc_pool_export() does the logging
 	 * of those commands.
 	 */
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
 	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
 	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
 	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
 	    zfs_ioc_dsobj_to_dsname,
 	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
 	    zfs_ioc_pool_get_history,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
 
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
 
 	/* Read-only dataset ioctls using zfs_secpolicy_read. */
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
 	    zfs_ioc_space_written);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
 	    zfs_ioc_objset_recvd_props);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
 	    zfs_ioc_next_obj);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
 	    zfs_ioc_get_fsacl);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
 	    zfs_ioc_objset_stats);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
 	    zfs_ioc_objset_zplprops);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
 	    zfs_ioc_dataset_list_next);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	    zfs_ioc_snapshot_list_next);
 	zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
 	    zfs_ioc_send_progress);
 
 	/* Read-only dataset ioctls with their own security policy. */
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
 	    zfs_ioc_diff, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
 	    zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
 	    zfs_ioc_obj_to_path, zfs_secpolicy_diff);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
 	    zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
 	    zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
 	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
 	    zfs_ioc_send, zfs_secpolicy_send);
 
 	/* Dataset-modifying ioctls: logged, refused on suspended/read-only pools. */
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
 	    zfs_secpolicy_none);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
 	    zfs_secpolicy_destroy);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
 	    zfs_secpolicy_rename);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
 	    zfs_secpolicy_recv);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
 	    zfs_secpolicy_promote);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
 	    zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
 	    zfs_secpolicy_set_fsacl);
 
 	/* Dataset ioctls that are never logged by zfsdev_ioctl(). */
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
 	    zfs_secpolicy_share, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
 	    zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
 	    zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
 	    zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 
 #ifdef __FreeBSD__
 	/* FreeBSD-only jail attach/detach ioctls. */
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
 #endif
 }
 
 /*
  * Apply the requested pool status checks to the pool that "name" refers
  * to.
  *
  * Returns 0 when POOL_CHECK_NONE is set or every requested check passes,
  * the spa_open() error when the pool cannot be opened, EAGAIN when
  * POOL_CHECK_SUSPENDED is requested and the pool is suspended, or EROFS
  * when POOL_CHECK_READONLY is requested and the pool is not writeable.
  * The suspended check takes precedence over the read-only check.
  */
 int
 pool_status_check(const char *name, zfs_ioc_namecheck_t type,
     zfs_ioc_poolcheck_t check)
 {
 	spa_t *pool;
 	int err;
 
 	ASSERT(type == POOL_NAME || type == DATASET_NAME);
 
 	if (check & POOL_CHECK_NONE)
 		return (0);
 
 	err = spa_open(name, &pool, FTAG);
 	if (err != 0)
 		return (err);
 
 	if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(pool)) {
 		err = SET_ERROR(EAGAIN);
 	} else if ((check & POOL_CHECK_READONLY) && !spa_writeable(pool)) {
 		err = SET_ERROR(EROFS);
 	}
 
 	spa_close(pool, FTAG);
 	return (err);
 }
 
 /*
  * Find a free minor number.
  *
  * The search starts just after the most recently handed-out minor and
  * wraps from ZFSDEV_MAX_MINOR back to 1.  A minor is free when it has no
  * soft state attached.  Returns the minor found, or 0 if the search came
  * back around to its starting point.  The caller must hold
  * spa_namespace_lock.
  */
 minor_t
 zfsdev_minor_alloc(void)
 {
 	static minor_t last_minor;
 	minor_t candidate;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	candidate = last_minor + 1;
 	while (candidate != last_minor) {
 		if (candidate > ZFSDEV_MAX_MINOR)
 			candidate = 1;
 		if (ddi_get_soft_state(zfsdev_state, candidate) == NULL) {
 			last_minor = candidate;
 			return (candidate);
 		}
 		candidate++;
 	}
 
 	return (0);
 }
 
 static int
 zfs_ctldev_init(struct cdev *devp)
 {
 	minor_t minor;
 	zfs_soft_state_t *zs;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	minor = zfsdev_minor_alloc();
 	if (minor == 0)
 		return (SET_ERROR(ENXIO));
 
 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
 		return (SET_ERROR(EAGAIN));
 
 	devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
 
 	zs = ddi_get_soft_state(zfsdev_state, minor);
 	zs->zss_type = ZSST_CTLDEV;
 	zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
 
 	return (0);
 }
 
 /*
  * Undo zfs_ctldev_init(): destroy the on-exit callback list "zo" and then
  * free the soft state of "minor".  The caller must hold
  * spa_namespace_lock.
  */
 static void
 zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Tear down the callbacks before the state that owns them. */
 	zfs_onexit_destroy(zo);
 	ddi_soft_state_free(zfsdev_state, minor);
 }
 
 /*
  * Return the zss_data pointer stored in the soft state of "minor", or
  * NULL when the minor has no soft state or its type is not "which".
  */
 void *
 zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
 {
 	zfs_soft_state_t *state;
 
 	state = ddi_get_soft_state(zfsdev_state, minor);
 	if (state != NULL && state->zss_type == which)
 		return (state->zss_data);
 
 	return (NULL);
 }
 
 /*
  * Open entry point of the control device.  Only an exclusive open (FEXCL)
  * gets per-open state, set up by zfs_ctldev_init() under
  * spa_namespace_lock; any other open succeeds without doing anything.
  */
 static int
 zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
 {
 	int error;
 
 #ifdef illumos
 	if (getminor(*devp) != 0)
 		return (zvol_open(devp, flag, otyp, cr));
 #endif
 
 	/* This is the control device. Allocate a new minor if requested. */
 	if ((flag & FEXCL) == 0)
 		return (0);
 
 	mutex_enter(&spa_namespace_lock);
 	error = zfs_ctldev_init(devp);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Destructor for the cdev private data installed by zfs_ctldev_init().
  * "data" carries the minor number; minor 0 has no per-open state.  If the
  * minor still has control-device state, it is torn down under
  * spa_namespace_lock.
  */
 static void
 zfsdev_close(void *data)
 {
 	minor_t minor = (minor_t)(uintptr_t)data;
 	zfs_onexit_t *onexit;
 
 	if (minor == 0)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	onexit = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
 	if (onexit != NULL)
 		zfs_ctldev_destroy(onexit, minor);
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ioctl entry point of the ZFS control device.
  *
  * The command number is the low byte of zcmd, and the argument length
  * encoded in zcmd selects the ABI.  A zfs_iocparm_t argument is the "new"
  * style: its zfs_ioctl_version field picks the zfs_cmd layout and its
  * zfs_cmd field points at the user's structure.  Any other recognized
  * length is an older zfs_cmd layout passed directly in "arg"; those are
  * translated to and from zfs_cmd_t with the zfs_cmd_compat_*() helpers.
  *
  * After the name check, pool status check and security policy pass, the
  * registered handler runs: new-style handlers take nvlists and may be
  * logged to the pool history, legacy handlers take the zfs_cmd_t itself.
  * The resulting zfs_cmd_t is copied back out in the caller's layout.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
     struct thread *td)
 {
 	zfs_cmd_t *zc;
 	uint_t vecnum;
 	int error, rc, len;
 #ifdef illumos
 	minor_t minor = getminor(dev);
 #else
 	zfs_iocparm_t *zc_iocparm;
 	int cflag, cmd, oldvecnum;
 	boolean_t newioc, compat;
 	void *compat_zc = NULL;
 	cred_t *cr = td->td_ucred;
 #endif
 	const zfs_ioc_vec_t *vec;
 	char *saved_poolname = NULL;
 	nvlist_t *innvl = NULL;
 
 	cflag = ZFS_CMD_COMPAT_NONE;
 	compat = B_FALSE;
 	newioc = B_TRUE;	/* "new" style (zfs_iocparm_t) ioctl */
 
 	len = IOCPARM_LEN(zcmd);
 	vecnum = cmd = zcmd & 0xff;
 
 	/*
 	 * Check if we are talking to supported older binaries
 	 * and translate zfs_cmd if necessary
 	 */
 	if (len != sizeof(zfs_iocparm_t)) {
 		newioc = B_FALSE;
 		compat = B_TRUE;
 
 		vecnum = cmd;
 
 		switch (len) {
 		case sizeof(zfs_cmd_zcmd_t):
 			cflag = ZFS_CMD_COMPAT_LZC;
 			break;
 		case sizeof(zfs_cmd_deadman_t):
 			cflag = ZFS_CMD_COMPAT_DEADMAN;
 			break;
 		case sizeof(zfs_cmd_v28_t):
 			cflag = ZFS_CMD_COMPAT_V28;
 			break;
 		case sizeof(zfs_cmd_v15_t):
 			/*
 			 * cmd comes straight from the caller (0..255), so
 			 * make sure it indexes inside the translation
 			 * table before using it.
 			 */
 			if (cmd >= sizeof (zfs_ioctl_v15_to_v28) /
 			    sizeof (zfs_ioctl_v15_to_v28[0]))
 				return (EINVAL);
 
 			cflag = ZFS_CMD_COMPAT_V15;
 			vecnum = zfs_ioctl_v15_to_v28[cmd];
 
 			/*
 			 * Return without further handling
 			 * if the command is blacklisted.
 			 */
 			if (vecnum == ZFS_IOC_COMPAT_PASS)
 				return (0);
 			else if (vecnum == ZFS_IOC_COMPAT_FAIL)
 				return (ENOTSUP);
 			break;
 		default:
 			return (EINVAL);
 		}
 	}
 
 #ifdef illumos
 	vecnum = cmd - ZFS_IOC_FIRST;
 	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
 #endif
 
 	if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (SET_ERROR(EINVAL));
 	vec = &zfs_ioc_vec[vecnum];
 
 	zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
 
 #ifdef illumos
 	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
 	if (error != 0) {
 		error = SET_ERROR(EFAULT);
 		goto out;
 	}
 #else	/* !illumos */
 	bzero(zc, sizeof(zfs_cmd_t));
 
 	if (newioc) {
 		zc_iocparm = (void *)arg;
 
 		/* Validate the declared size against the declared version. */
 		switch (zc_iocparm->zfs_ioctl_version) {
 		case ZFS_IOCVER_CURRENT:
 			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
 				error = SET_ERROR(EINVAL);
 				goto out;
 			}
 			break;
 		case ZFS_IOCVER_EDBP:
 			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 			compat = B_TRUE;
 			cflag = ZFS_CMD_COMPAT_EDBP;
 			break;
 		case ZFS_IOCVER_ZCMD:
 			if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
 			    zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 			compat = B_TRUE;
 			cflag = ZFS_CMD_COMPAT_ZCMD;
 			break;
 		default:
 			error = SET_ERROR(EINVAL);
 			goto out;
 			/* NOTREACHED */
 		}
 
 		if (compat) {
 			/* Older layout: stage it in a scratch buffer first. */
 			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
 			compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
 			bzero(compat_zc, sizeof(zfs_cmd_t));
 
 			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    compat_zc, zc_iocparm->zfs_cmd_size, flag);
 			if (error != 0) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 		} else {
 			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    zc, zc_iocparm->zfs_cmd_size, flag);
 			if (error != 0) {
 				error = SET_ERROR(EFAULT);
 				goto out;
 			}
 		}
 	}
 
 	if (compat) {
 		if (newioc) {
 			ASSERT(compat_zc != NULL);
 			zfs_cmd_compat_get(zc, compat_zc, cflag);
 		} else {
 			ASSERT(compat_zc == NULL);
 			zfs_cmd_compat_get(zc, arg, cflag);
 		}
 		/* The compat layer may redirect the call to another vector. */
 		oldvecnum = vecnum;
 		error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
 		if (error != 0)
 			goto out;
 		if (oldvecnum != vecnum)
 			vec = &zfs_ioc_vec[vecnum];
 	}
 #endif	/* !illumos */
 
 	zc->zc_iflags = flag & FKIOCTL;
 	if (zc->zc_nvlist_src_size != 0) {
 		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 		    zc->zc_iflags, &innvl);
 		if (error != 0)
 			goto out;
 	}
 
 	/* rewrite innvl for backwards compatibility */
 	if (compat)
 		innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
 
 	/*
 	 * Ensure that all pool/dataset names are valid before we pass down to
 	 * the lower layers.
 	 */
 	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
 	switch (vec->zvec_namecheck) {
 	case POOL_NAME:
 		if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
 			error = SET_ERROR(EINVAL);
 		else
 			error = pool_status_check(zc->zc_name,
 			    vec->zvec_namecheck, vec->zvec_pool_check);
 		break;
 
 	case DATASET_NAME:
 		if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
 			error = SET_ERROR(EINVAL);
 		else
 			error = pool_status_check(zc->zc_name,
 			    vec->zvec_namecheck, vec->zvec_pool_check);
 		break;
 
 	case NO_NAME:
 		break;
 	}
 
 	/* The security policy is skipped for in-kernel (FKIOCTL) callers. */
 	if (error == 0 && !(flag & FKIOCTL))
 		error = vec->zvec_secpolicy(zc, innvl, cr);
 
 	if (error != 0)
 		goto out;
 
 	/* legacy ioctls can modify zc_name */
 	len = strcspn(zc->zc_name, "/@#") + 1;
 	saved_poolname = kmem_alloc(len, KM_SLEEP);
 	(void) strlcpy(saved_poolname, zc->zc_name, len);
 
 	if (vec->zvec_func != NULL) {
 		nvlist_t *outnvl;
 		int puterror = 0;
 		spa_t *spa;
 		nvlist_t *lognv = NULL;
 
 		ASSERT(vec->zvec_legacy_func == NULL);
 
 		/*
 		 * Add the innvl to the lognv before calling the func,
 		 * in case the func changes the innvl.
 		 */
 		if (vec->zvec_allow_log) {
 			lognv = fnvlist_alloc();
 			fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
 			    vec->zvec_name);
 			if (!nvlist_empty(innvl)) {
 				fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
 				    innvl);
 			}
 		}
 
 		outnvl = fnvlist_alloc();
 		error = vec->zvec_func(zc->zc_name, innvl, outnvl);
 
 		if (error == 0 && vec->zvec_allow_log &&
 		    spa_open(zc->zc_name, &spa, FTAG) == 0) {
 			if (!nvlist_empty(outnvl)) {
 				fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
 				    outnvl);
 			}
 			(void) spa_history_log_nvl(spa, lognv);
 			spa_close(spa, FTAG);
 		}
 		fnvlist_free(lognv);
 
 		/* rewrite outnvl for backwards compatibility */
 		if (compat)
 			outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
 			    cflag);
 
 		if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
 			int smusherror = 0;
 			if (vec->zvec_smush_outnvlist) {
 				smusherror = nvlist_smush(outnvl,
 				    zc->zc_nvlist_dst_size);
 			}
 			if (smusherror == 0)
 				puterror = put_nvlist(zc, outnvl);
 		}
 
 		if (puterror != 0)
 			error = puterror;
 
 		nvlist_free(outnvl);
 	} else {
 		error = vec->zvec_legacy_func(zc);
 	}
 
 out:
 	nvlist_free(innvl);
 
 #ifdef illumos
 	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
 	if (error == 0 && rc != 0)
 		error = SET_ERROR(EFAULT);
 #else
 	if (compat) {
 		zfs_ioctl_compat_post(zc, cmd, cflag);
 		if (newioc) {
 			ASSERT(compat_zc != NULL);
 			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
 
 			zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
 			rc = ddi_copyout(compat_zc,
 			    (void *)(uintptr_t)zc_iocparm->zfs_cmd,
 			    zc_iocparm->zfs_cmd_size, flag);
 			if (error == 0 && rc != 0)
 				error = SET_ERROR(EFAULT);
 			kmem_free(compat_zc, sizeof (zfs_cmd_t));
 		} else {
 			zfs_cmd_compat_put(zc, arg, vecnum, cflag);
 		}
 	} else {
 		ASSERT(newioc);
 
 		rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
 		    sizeof (zfs_cmd_t), flag);
 		if (error == 0 && rc != 0)
 			error = SET_ERROR(EFAULT);
 	}
 #endif
 	/*
 	 * For a successful loggable ioctl, hand the saved pool name to the
 	 * thread-specific slot (replacing any previous value); otherwise
 	 * just release it.
 	 */
 	if (error == 0 && vec->zvec_allow_log) {
 		char *s = tsd_get(zfs_allow_log_key);
 		if (s != NULL)
 			strfree(s);
 		(void) tsd_set(zfs_allow_log_key, saved_poolname);
 	} else {
 		if (saved_poolname != NULL)
 			strfree(saved_poolname);
 	}
 
 	kmem_free(zc, sizeof (zfs_cmd_t));
 	return (error);
 }
 
 #ifdef illumos
 /*
  * illumos attach(9E) entry point.  Only DDI_ATTACH is accepted: it
  * creates the "zfs" character minor node, remembers the dev_info pointer
  * in zfs_dip and reports the device.  Anything else, or a failure to
  * create the node, yields DDI_FAILURE.
  */
 static int
 zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
 	int rv;
 
 	if (cmd != DDI_ATTACH)
 		return (DDI_FAILURE);
 
 	rv = ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0);
 	if (rv == DDI_FAILURE)
 		return (DDI_FAILURE);
 
 	zfs_dip = dip;
 	ddi_report_dev(dip);
 
 	return (DDI_SUCCESS);
 }
 
 /*
  * illumos detach(9E) entry point.  Fails while the SPA, ZFS or zvol
  * layers are busy, or for any command other than DDI_DETACH; otherwise
  * clears zfs_dip and removes the device's properties and minor nodes.
  */
 static int
 zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
 	if (spa_busy() || zfs_busy() || zvol_busy() || cmd != DDI_DETACH)
 		return (DDI_FAILURE);
 
 	zfs_dip = NULL;
 
 	ddi_prop_remove_all(dip);
 	ddi_remove_minor_node(dip, NULL);
 
 	return (DDI_SUCCESS);
 }
 
 /*
  * illumos getinfo(9E) entry point.  DDI_INFO_DEVT2DEVINFO returns zfs_dip
  * and DDI_INFO_DEVT2INSTANCE returns instance 0; every other request
  * fails.
  */
 /*ARGSUSED*/
 static int
 zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 {
 	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
 		*result = zfs_dip;
 		return (DDI_SUCCESS);
 	}
 
 	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
 		*result = (void *)0;
 		return (DDI_SUCCESS);
 	}
 
 	return (DDI_FAILURE);
 }
 #endif	/* illumos */
 
 /*
  * OK, so this is a little weird.
  *
  * /dev/zfs is the control node, i.e. minor 0.
  * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
  *
  * /dev/zfs has basically nothing to do except serve up ioctls,
  * so most of the standard driver entry points are in zvol.c.
  */
 #ifdef illumos
 /*
  * Character/block entry points (illumos only).  open, close and ioctl are
  * served by the zfsdev_* routines in this file; strategy, dump, read and
  * write are the zvol_* routines.
  */
 static struct cb_ops zfs_cb_ops = {
 	zfsdev_open,	/* open */
 	zfsdev_close,	/* close */
 	zvol_strategy,	/* strategy */
 	nodev,		/* print */
 	zvol_dump,	/* dump */
 	zvol_read,	/* read */
 	zvol_write,	/* write */
 	zfsdev_ioctl,	/* ioctl */
 	nodev,		/* devmap */
 	nodev,		/* mmap */
 	nodev,		/* segmap */
 	nochpoll,	/* poll */
 	ddi_prop_op,	/* prop_op */
 	NULL,		/* streamtab */
 	D_NEW | D_MP | D_64BIT,		/* Driver compatibility flag */
 	CB_REV,		/* version */
 	nodev,		/* async read */
 	nodev,		/* async write */
 };
 
 /*
  * Device operations (illumos only): autoconfiguration entry points plus a
  * pointer to the cb_ops table above.
  */
 static struct dev_ops zfs_dev_ops = {
 	DEVO_REV,	/* version */
 	0,		/* refcnt */
 	zfs_info,	/* info */
 	nulldev,	/* identify */
 	nulldev,	/* probe */
 	zfs_attach,	/* attach */
 	zfs_detach,	/* detach */
 	nodev,		/* reset */
 	&zfs_cb_ops,	/* driver operations */
 	NULL,		/* no bus operations */
 	NULL,		/* power */
 	ddi_quiesce_not_needed,	/* quiesce */
 };
 
 /* Driver linkage (illumos only): description string and the dev_ops table. */
 static struct modldrv zfs_modldrv = {
 	&mod_driverops,
 	"ZFS storage pool",
 	&zfs_dev_ops
 };
 
 /*
  * Module linkage (illumos only), passed to mod_install(), mod_remove()
  * and mod_info(): a file system linkage (zfs_modlfs) and the driver
  * linkage above, NULL-terminated.
  */
 static struct modlinkage modlinkage = {
 	MODREV_1,
 	(void *)&zfs_modlfs,
 	(void *)&zfs_modldrv,
 	NULL
 };
 #endif	/* illumos */
 
 /*
  * Character device switch for the control device.  There is no d_close
  * entry: per-open teardown is done by zfsdev_close(), which
  * zfs_ctldev_init() installs as the cdev private data destructor.
  */
 static struct cdevsw zfs_cdevsw = {
 	.d_version = D_VERSION,
 	.d_name = ZFS_DEV_NAME,
 	.d_open = zfsdev_open,
 	.d_ioctl = zfsdev_ioctl,
 };
 
 /*
  * Destructor for the zfs_allow_log_key thread-specific value: "arg" is
  * the pool name string stored by zfsdev_ioctl(), released with strfree().
  */
 static void
 zfs_allow_log_destroy(void *arg)
 {
 	strfree((char *)arg);
 }
 
 /*
  * Create the control device node named ZFS_DEV_NAME (unit 0, owned by
  * root:operator, mode 0666) and remember it in zfsdev.
  */
 static void
 zfsdev_init(void)
 {
 	struct cdev *node;
 
 	node = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
 	    ZFS_DEV_NAME);
 	zfsdev = node;
 }
 
 /*
  * Destroy the control device node created by zfsdev_init(), if any.
  */
 static void
 zfsdev_fini(void)
 {
 	if (zfsdev == NULL)
 		return;
 
 	destroy_dev(zfsdev);
 }
 
 /* Root-mount hold token, taken and released again inside zfs__init(). */
 static struct root_hold_token *zfs_root_token;
 /* NOTE(review): not referenced in this part of the file; presumably set up elsewhere -- confirm. */
 struct proc *zfsproc;
 
 #ifdef illumos
 int
 _init(void)
 {
 	int error;
 
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	zfs_ioctl_init();
 
 	if ((error = mod_install(&modlinkage)) != 0) {
 		zvol_fini();
 		zfs_fini();
 		spa_fini();
 		return (error);
 	}
 
 	tsd_create(&zfs_fsyncer_key, NULL);
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
 
 	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
 	ASSERT(error == 0);
 	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (0);
 }
 
 int
 _fini(void)
 {
 	int error;
 
 	if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
 		return (SET_ERROR(EBUSY));
 
 	if ((error = mod_remove(&modlinkage)) != 0)
 		return (error);
 
 	zvol_fini();
 	zfs_fini();
 	spa_fini();
 	if (zfs_nfsshare_inited)
 		(void) ddi_modclose(nfs_mod);
 	if (zfs_smbshare_inited)
 		(void) ddi_modclose(smbsrv_mod);
 	if (zfs_nfsshare_inited || zfs_smbshare_inited)
 		(void) ddi_modclose(sharefs_mod);
 
 	tsd_destroy(&zfs_fsyncer_key);
 	ldi_ident_release(zfs_li);
 	zfs_li = NULL;
 	mutex_destroy(&zfs_share_lock);
 
 	return (error);
 }
 
 /*
  * illumos module information entry point; forwards to mod_info() with
  * this module's linkage.
  */
 int
 _info(struct modinfo *modinfop)
 {
 	int rv;
 
 	rv = mod_info(&modlinkage, modinfop);
 	return (rv);
 }
 #endif	/* illumos */
 
 /*
  * Module lifecycle hooks, defined below.  zfs__init() and zfs__fini() are
  * called from zfs_modevent(); zfs__fini() is also called from
  * zfs_shutdown().
  */
 static int zfs__init(void);
 static int zfs__fini(void);
 static void zfs_shutdown(void *, int);
 
 /* Tag returned when zfs_shutdown() is registered for shutdown_post_sync. */
 static eventhandler_tag zfs_shutdown_event_tag;
 
 #ifdef __FreeBSD__
 /* zfs__init() prints a warning when KSTACK_PAGES is below this value. */
 #define ZFS_MIN_KSTACK_PAGES 4
 #endif
 
 /*
  * FreeBSD module initialization.  While holding off the root mount, bring
  * up the SPA, ZFS, zvol and ioctl layers and create the thread-specific
  * data keys; then release the root mount hold and create the control
  * device.  Always returns 0.
  */
 int
 zfs__init(void)
 {
 
 #if defined(__FreeBSD__) && KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
 	/* Warn at load time when the kernel stack is smaller than ZFS wants. */
 	printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
 	    "overflow panic!\nPlease consider adding "
 	    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
 	    ZFS_MIN_KSTACK_PAGES);
 #endif
 	zfs_root_token = root_mount_hold("ZFS");
 
 	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Subsystems, in dependency order. */
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	zfs_ioctl_init();
 
 	/* Thread-specific data keys. */
 	tsd_create(&zfs_fsyncer_key, NULL);
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
 	tsd_create(&zfs_geom_probe_vdev_key, NULL);
 
 	printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
 	root_mount_rel(zfs_root_token);
 
 	zfsdev_init();
 
 	return (0);
 }
 
 int
 zfs__fini(void)
 {
 	if (spa_busy() || zfs_busy() || zvol_busy() ||
 	    zio_injection_enabled) {
 		return (EBUSY);
 	}
 
 	zfsdev_fini();
 	zvol_fini();
 	zfs_fini();
 	spa_fini();
 
 	tsd_destroy(&zfs_fsyncer_key);
 	tsd_destroy(&rrw_tsd_key);
 	tsd_destroy(&zfs_allow_log_key);
 
 	mutex_destroy(&zfs_share_lock);
 
 	return (0);
 }
 
 /*
  * shutdown_post_sync event handler: run the module teardown unless the
  * system has panicked.
  */
 static void
 zfs_shutdown(void *arg __unused, int howto __unused)
 {
 
 	/*
 	 * ZFS fini routines can not properly work in a panic-ed system.
 	 */
 	if (panicstr != NULL)
 		return;
 
 	(void)zfs__fini();
 }
 
 
 /*
  * Module event handler.
  *
  * MOD_LOAD runs zfs__init() and, on success, registers zfs_shutdown() for
  * the shutdown_post_sync event.  MOD_UNLOAD runs zfs__fini() and, on
  * success, deregisters that handler if one was registered.  MOD_SHUTDOWN
  * is accepted and does nothing.  Any other event yields EOPNOTSUPP.
  */
 static int
 zfs_modevent(module_t mod, int type, void *unused __unused)
 {
 	int err;
 
 	if (type == MOD_LOAD) {
 		err = zfs__init();
 		if (err == 0)
 			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
 			    shutdown_post_sync, zfs_shutdown, NULL,
 			    SHUTDOWN_PRI_FIRST);
 		return (err);
 	}
 
 	if (type == MOD_UNLOAD) {
 		err = zfs__fini();
 		if (err == 0 && zfs_shutdown_event_tag != NULL)
 			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 			    zfs_shutdown_event_tag);
 		return (err);
 	}
 
 	if (type == MOD_SHUTDOWN)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, and no private data. */
 static moduledata_t zfs_mod = {
 	"zfsctrl",
 	zfs_modevent,
 	0
 };
 /*
  * Declare the "zfsctrl" module (version 1) at SI_SUB_VFS and its
  * dependencies on the opensolaris, krpc and acl_nfs4 modules.
  */
 DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(zfsctrl, 1);
 MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
 MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
 MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	(revision 296519)
@@ -1,680 +1,681 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/file.h>
 #include <sys/vfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/byteorder.h>
 #include <sys/policy.h>
 #include <sys/stat.h>
 #include <sys/acl.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
 #include <sys/dsl_dataset.h>
 
 /*
  * These zfs_log_* functions must be called within a dmu tx, in one
  * of 2 contexts depending on zilog->z_replay:
  *
  * Non replay mode
  * ---------------
  * We need to record the transaction so that if it is committed to
  * the Intent Log then it can be replayed.  An intent log transaction
  * structure (itx_t) is allocated and all the information necessary to
  * possibly replay the transaction is saved in it. The itx is then assigned
  * a sequence number and inserted in the in-memory list anchored in the zilog.
  *
  * Replay mode
  * -----------
  * We need to mark the intent log record as replayed in the log header.
  * This is done in the same transaction as the replay so that they
  * commit atomically.
  */
 
 /*
  * Pick the intent-log transaction type for a create request.
  *
  * The choice depends on the kind of object (file, directory, xattr
  * directory), on whether an ACL was supplied (vsecp != NULL) and on
  * whether vap carries extended attributes (AT_XVATTR set in va_mask).
  * The "ACL and xvattr" combination is only handled when TODO is defined;
  * otherwise it panics.  An unknown type trips ASSERT(0) and returns
  * TX_MAX_TYPE.
  */
 int
 zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
 {
 	const int has_xvattr = ((vap->va_mask & AT_XVATTR) != 0);
 	const int has_acl = (vsecp != NULL);
 
 	switch (type) {
 	case Z_FILE:
 		if (!has_acl && !has_xvattr)
 			return (TX_CREATE);
 		if (has_acl && has_xvattr) {
 #ifdef TODO
 			return (TX_CREATE_ACL_ATTR);
 #else
 			panic("%s:%u: unsupported condition", __func__, __LINE__);
 #endif
 		}
 		return (has_acl ? TX_CREATE_ACL : TX_CREATE_ATTR);
 	case Z_DIR:
 		if (!has_acl && !has_xvattr)
 			return (TX_MKDIR);
 		if (has_acl && has_xvattr) {
 #ifdef TODO
 			return (TX_MKDIR_ACL_ATTR);
 #else
 			panic("%s:%u: unsupported condition", __func__, __LINE__);
 #endif
 		}
 		return (has_acl ? TX_MKDIR_ACL : TX_MKDIR_ATTR);
 	case Z_XATTRDIR:
 		return (TX_MKXATTR);
 	}
 
 	/* Not one of the known create kinds. */
 	ASSERT(0);
 	return (TX_MAX_TYPE);
 }
 
 /*
  * Build up the log data necessary for logging an xvattr_t.
  *
  * Layout written starting at lrattr:
  *   - lr_attr_masksize, set to xvap->xva_mapsize;
  *   - xva_mapsize 32-bit words copied from xva_reqattrmap, starting at
  *     lr_attr_bitmap;
  *   - one 64-bit word holding the XAT0_* bits to set on replay;
  *   - two 64-bit words reserved for the create time (filled in only when
  *     XAT_CREATETIME is requested);
  *   - AV_SCANSTAMP_SZ bytes for the scanstamp (filled in only when
  *     XAT_AV_SCANSTAMP is requested).
  *
  * A bit is set in the packed word only when the attribute is both
  * requested and non-zero in the optional attribute block.
  */
 static void
 zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
 {
 	uint32_t	*bitmap;
 	uint64_t	*attrs;
 	uint64_t	*crtime;
 	xoptattr_t	*xoap;
 	void		*scanstamp;
 	int		i;
 
 	xoap = xva_getxoptattr(xvap);
 	ASSERT(xoap);
 
 	/* Copy the request bitmap; bitmap ends up just past the last word. */
 	lrattr->lr_attr_masksize = xvap->xva_mapsize;
 	bitmap = &lrattr->lr_attr_bitmap;
 	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
 		*bitmap = xvap->xva_reqattrmap[i];
 	}
 
 	/* Now pack the attributes up in a single uint64_t */
 	attrs = (uint64_t *)bitmap;
 	crtime = attrs + 1;
 	scanstamp = (caddr_t)(crtime + 2);
 	*attrs = 0;
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
 		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
 		    XAT0_READONLY;
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
 		*attrs |= (xoap->xoa_hidden == 0) ? 0 :
 		    XAT0_HIDDEN;
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
 		*attrs |= (xoap->xoa_system == 0) ? 0 :
 		    XAT0_SYSTEM;
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
 		*attrs |= (xoap->xoa_archive == 0) ? 0 :
 		    XAT0_ARCHIVE;
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
 		*attrs |= (xoap->xoa_immutable == 0) ? 0 :
 		    XAT0_IMMUTABLE;
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
 		*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
 		    XAT0_NOUNLINK;
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
 		*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
 		    XAT0_APPENDONLY;
 	/*
 	 * The opaque flag gets its own bit; logging it as XAT0_APPENDONLY
 	 * would record the wrong attribute.
 	 */
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
 		*attrs |= (xoap->xoa_opaque == 0) ? 0 :
 		    XAT0_OPAQUE;
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
 		*attrs |= (xoap->xoa_nodump == 0) ? 0 :
 		    XAT0_NODUMP;
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
 		*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
 		    XAT0_AV_QUARANTINED;
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
 		*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
 		    XAT0_AV_MODIFIED;
 	/* Non-flag payloads go into their reserved slots. */
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 		*attrs |= (xoap->xoa_reparse == 0) ? 0 :
 		    XAT0_REPARSE;
 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
 		*attrs |= (xoap->xoa_offline == 0) ? 0 :
 		    XAT0_OFFLINE;
 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
 		*attrs |= (xoap->xoa_sparse == 0) ? 0 :
 		    XAT0_SPARSE;
 }
 
 /*
  * Copy the z_logfuid of every entry on fuidp->z_fuids into consecutive
  * 64-bit slots beginning at start.  Returns the address just past the
  * last slot written (start itself when the list is empty).
  */
 static void *
 zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
 {
 	uint64_t *out = start;
 	zfs_fuid_t *node = list_head(&fuidp->z_fuids);
 
 	/* First copy in the ACE FUIDs */
 	while (node != NULL) {
 		*out = node->z_logfuid;
 		out++;
 		node = list_next(&fuidp->z_fuids, node);
 	}
 
 	return (out);
 }
 
 
 /*
  * Append every domain string on fuidp->z_domains, NUL terminator
  * included, back to back starting at start.  Nothing is written when
  * z_domain_str_sz is zero.  Returns the address just past the last byte
  * written.
  */
 static void *
 zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
 {
 	zfs_fuid_domain_t *dom;
 	caddr_t cursor = start;
 
 	/* No domain info recorded: leave the buffer untouched. */
 	if (fuidp->z_domain_str_sz == 0)
 		return (start);
 
 	for (dom = list_head(&fuidp->z_domains); dom != NULL;
 	    dom = list_next(&fuidp->z_domains, dom)) {
 		size_t nbytes = strlen(dom->z_domain) + 1;
 
 		bcopy((void *)dom->z_domain, cursor, nbytes);
 		cursor += nbytes;
 	}
 
 	return (cursor);
 }
 
 /*
  * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
  * TX_MKXATTR transactions, plus the *_ACL and *_ACL_ATTR variants.
  *
  * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
  * domain information appended prior to the name.  In this case the
  * uid/gid in the log record will be a log centric FUID.
  *
  * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
  * may contain attributes, ACL and optional fuid information.
  *
  * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
  * an ACL and normal users/groups in the ACEs.
  *
  * There may be an optional xvattr attribute information similar
  * to zfs_log_setattr.
  *
  * Record layout, in order: the fixed log record (lr_create_t or
  * lr_acl_create_t), optional xvattr data, optional ACEs (padded to
  * ZIL_ACE_LENGTH), optional FUIDs, optional FUID domain strings, and
  * finally the NUL-terminated file name.
  *
  * Nothing is logged while the intent log is being replayed.
  */
 void
 zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
     zfs_fuid_info_t *fuidp, vattr_t *vap)
 {
 	itx_t *itx;
 	lr_create_t *lr;
 	lr_acl_create_t *lracl;
 	size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
 	size_t xvatsize = 0;
 	size_t txsize;
 	xvattr_t *xvap = (xvattr_t *)vap;
 	void *end;
 	size_t lrsize;
 	size_t namesize = strlen(name) + 1;
 	size_t fuidsz = 0;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	/*
 	 * If we have FUIDs present then add in space for
 	 * domains and ACE fuid's if any.
 	 */
 	if (fuidp) {
 		fuidsz += fuidp->z_domain_str_sz;
 		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
 	}
 
 	if (vap->va_mask & AT_XVATTR)
 		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
 
 	/* The non-ACL types use the short record; the rest carry ACEs. */
 	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
 	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
 	    (int)txtype == TX_MKXATTR) {
 		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
 		lrsize = sizeof (*lr);
 	} else {
 		txsize =
 		    sizeof (lr_acl_create_t) + namesize + fuidsz +
 		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
 		lrsize = sizeof (lr_acl_create_t);
 	}
 
 	itx = zil_itx_create(txtype, txsize);
 
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	lr->lr_foid = zp->z_id;
 	lr->lr_mode = zp->z_mode;
 	/* Ephemeral ids are logged as the log-centric FUID from fuidp. */
 	if (!IS_EPHEMERAL(zp->z_uid)) {
 		lr->lr_uid = (uint64_t)zp->z_uid;
 	} else {
 		lr->lr_uid = fuidp->z_fuid_owner;
 	}
 	if (!IS_EPHEMERAL(zp->z_gid)) {
 		lr->lr_gid = (uint64_t)zp->z_gid;
 	} else {
 		lr->lr_gid = fuidp->z_fuid_group;
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
 	    sizeof (uint64_t));
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 	    lr->lr_crtime, sizeof (uint64_t) * 2);
 
 	/* A failed rdev lookup is logged as rdev 0. */
 	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
 	    sizeof (lr->lr_rdev)) != 0)
 		lr->lr_rdev = 0;
 
 	/*
 	 * Fill in xvattr info if any
 	 */
 	if (vap->va_mask & AT_XVATTR) {
 		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
 		end = (caddr_t)lr + lrsize + xvatsize;
 	} else {
 		end = (caddr_t)lr + lrsize;
 	}
 
 	/* Now fill in any ACL info */
 
 	if (vsecp) {
 		lracl = (lr_acl_create_t *)&itx->itx_lr;
 		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
 		lracl->lr_acl_bytes = aclsize;
 		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
 		lracl->lr_fuidcnt  = fuidp ? fuidp->z_fuid_cnt : 0;
 		/*
 		 * VSA_ACE_ACLFLAGS is a vsa_mask bit saying that
 		 * vsa_aclflags is valid (zfs_log_acl tests it the same
 		 * way); it is not a bit within vsa_aclflags itself.
 		 */
 		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
 			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
 		else
 			lracl->lr_acl_flags = 0;
 
 		bcopy(vsecp->vsa_aclentp, end, aclsize);
 		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
 	}
 
 	/* drop in FUID info */
 	if (fuidp) {
 		end = zfs_log_fuid_ids(fuidp, end);
 		end = zfs_log_fuid_domains(fuidp, end);
 	}
 	/*
 	 * Now place file name in log record
 	 */
 	bcopy(name, end, namesize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles both TX_REMOVE and TX_RMDIR transactions.
  *
  * The record is an lr_remove_t followed directly by the NUL-terminated
  * name; foid is stored in the itx's itx_oid field.  Nothing is logged
  * while the intent log is being replayed.
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, char *name, uint64_t foid)
 {
 	size_t namesize = strlen(name) + 1;
 	lr_remove_t *lr;
 	itx_t *itx;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(txtype, sizeof (lr_remove_t) + namesize);
 	itx->itx_oid = foid;
 
 	lr = (lr_remove_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	/* The name lives immediately after the fixed-size record. */
 	bcopy(name, (char *)&lr[1], namesize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_LINK transactions.
  *
  * The record is an lr_link_t (directory object id and linked object id)
  * followed directly by the NUL-terminated name.  Nothing is logged while
  * the intent log is being replayed.
  */
 void
 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name)
 {
 	size_t namesize = strlen(name) + 1;
 	lr_link_t *lr;
 	itx_t *itx;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(txtype, sizeof (lr_link_t) + namesize);
 
 	lr = (lr_link_t *)&itx->itx_lr;
 	lr->lr_link_obj = zp->z_id;
 	lr->lr_doid = dzp->z_id;
 	/* The name lives immediately after the fixed-size record. */
 	bcopy(name, (char *)&lr[1], namesize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_SYMLINK transactions.
  *
  * The record is an lr_create_t followed by the NUL-terminated name and
  * then the NUL-terminated link target.  Nothing is logged while the
  * intent log is being replayed.
  */
 void
 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, char *link)
 {
 	size_t namesize = strlen(name) + 1;
 	size_t linksize = strlen(link) + 1;
 	lr_create_t *lr;
 	itx_t *itx;
 	char *payload;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(txtype,
 	    sizeof (lr_create_t) + namesize + linksize);
 
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	lr->lr_foid = zp->z_id;
 	lr->lr_uid = zp->z_uid;
 	lr->lr_gid = zp->z_gid;
 	lr->lr_mode = zp->z_mode;
 	/* Lookup failures are deliberately ignored here. */
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
 	    sizeof (uint64_t));
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 	    lr->lr_crtime, sizeof (uint64_t) * 2);
 
 	/* Name first, link target right behind it. */
 	payload = (char *)&lr[1];
 	bcopy(name, payload, namesize);
 	bcopy(link, payload + namesize, linksize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_RENAME transactions.
  *
  * The record is an lr_rename_t (source and target directory object ids)
  * followed by the NUL-terminated source name and then the NUL-terminated
  * destination name; the id of szp is stored in the itx's itx_oid field.
  * Nothing is logged while the intent log is being replayed.
  */
 void
 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
 {
 	size_t snamesize = strlen(sname) + 1;
 	size_t dnamesize = strlen(dname) + 1;
 	lr_rename_t *lr;
 	itx_t *itx;
 	char *payload;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(txtype,
 	    sizeof (lr_rename_t) + snamesize + dnamesize);
 	itx->itx_oid = szp->z_id;
 
 	lr = (lr_rename_t *)&itx->itx_lr;
 	lr->lr_sdoid = sdzp->z_id;
 	lr->lr_tdoid = tdzp->z_id;
 
 	/* Source name first, destination name right behind it. */
 	payload = (char *)&lr[1];
 	bcopy(sname, payload, snamesize);
 	bcopy(dname, payload + snamesize, dnamesize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_WRITE transactions.
  *
  * zfs_immediate_write_sz is the size threshold used below when choosing
  * between an indirect write record and one that carries (or later copies)
  * the data; it is ignored (treated as 0) when the log bias is
  * ZFS_LOGBIAS_THROUGHPUT.
  *
  * Nothing is logged while the intent log is being replayed or when the
  * znode is already unlinked.  A write that is not logged as WR_INDIRECT
  * and exceeds ZIL_MAX_LOG_DATA is split across several itxs.
  */
 ssize_t zfs_immediate_write_sz = 32768;
 
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
 	itx_wr_state_t write_state;
 	boolean_t slogging;
 	uintptr_t fsync_cnt;
 	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 	    ? 0 : zfs_immediate_write_sz;
 
 	/*
 	 * Pick the write state:
 	 *  - WR_INDIRECT when the write is larger than the threshold, fits
 	 *    in one file block, and we are not "slogging" (pool has slogs
 	 *    and the log bias is latency);
 	 *  - WR_COPIED when the caller asked for synchronous semantics;
 	 *  - WR_NEED_COPY otherwise.
 	 */
 	slogging = spa_has_slogs(zilog->zl_spa) &&
 	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
 		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
 	else
 		write_state = WR_NEED_COPY;
 
 	/*
 	 * Consume one unit of the per-thread fsync counter, if non-zero.
 	 * fsync_cnt keeps the pre-decrement value for the itx_sync test
 	 * below.
 	 */
 	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
 		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 	}
 
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
 		ssize_t len;
 
 		/*
 		 * If the write would overflow the largest block then split it.
 		 */
 		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
 			len = SPA_OLD_MAXBLOCKSIZE >> 1;
 		else
 			len = resid;
 
 		/* Only WR_COPIED records reserve room for the data itself. */
 		itx = zil_itx_create(txtype, sizeof (*lr) +
 		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 		/*
 		 * For WR_COPIED, read the data in right behind the record.
 		 * If that read fails, fall back to a data-less WR_NEED_COPY
 		 * record; note the downgrade also applies to any remaining
 		 * iterations of this loop.
 		 */
 		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
 		}
 
 		itx->itx_wr_state = write_state;
 		/* WR_NEED_COPY data is accounted for in itx_sod. */
 		if (write_state == WR_NEED_COPY)
 			itx->itx_sod += len;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
 		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zp->z_zfsvfs;
 
 		/*
 		 * Mark the itx asynchronous only when nothing asked for
 		 * synchronous behaviour: no FSYNC/FDSYNC flag, no sync
 		 * openers on the znode, and no pending fsync count.
 		 */
 		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
 		    (fsync_cnt == 0))
 			itx->itx_sync = B_FALSE;
 
 		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
 		resid -= len;
 	}
 }
 
 /*
  * Handles TX_TRUNCATE transactions.
  *
  * Logs the object id together with the offset and length of the range.
  * The itx is marked synchronous when the znode has a non-zero
  * z_sync_cnt.  Nothing is logged during replay or for an unlinked znode.
  */
 void
 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, uint64_t off, uint64_t len)
 {
 	lr_truncate_t *lr;
 	itx_t *itx;
 
 	if (zil_replaying(zilog, tx))
 		return;
 	if (zp->z_unlinked)
 		return;
 
 	itx = zil_itx_create(txtype, sizeof (lr_truncate_t));
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 
 	lr = (lr_truncate_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
 	lr->lr_offset = off;
 	lr->lr_length = len;
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_SETATTR transactions.
  *
  * The record is an lr_setattr_t, optionally followed by xvattr data
  * (when AT_XVATTR is set in vap->va_mask) and then by the FUID domain
  * strings (when fuidp is supplied).  An ephemeral uid/gid that was
  * actually applied is logged as the FUID taken from fuidp.  Nothing is
  * logged during replay or for an unlinked znode.
  */
 void
 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
 {
 	xvattr_t *xvap = (xvattr_t *)vap;
 	lr_setattr_t *lr;
 	itx_t *itx;
 	size_t rec_bytes;
 	size_t xvat_bytes = 0;
 	void *tail;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	/*
 	 * With XVATTR set the record must also hold the lr_attr_t, the
 	 * xvattr mask, the map size, the create time and the attribute
 	 * values.
 	 */
 	if (vap->va_mask & AT_XVATTR)
 		xvat_bytes = ZIL_XVAT_SIZE(xvap->xva_mapsize);
 
 	rec_bytes = sizeof (lr_setattr_t) + xvat_bytes;
 	if (fuidp != NULL)
 		rec_bytes += fuidp->z_domain_str_sz;
 
 	itx = zil_itx_create(txtype, rec_bytes);
 	lr = (lr_setattr_t *)&itx->itx_lr;
 
 	lr->lr_foid = zp->z_id;
 	lr->lr_mask = (uint64_t)mask_applied;
 	lr->lr_mode = (uint64_t)vap->va_mode;
 	lr->lr_uid = ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ?
 	    fuidp->z_fuid_owner : (uint64_t)vap->va_uid;
 	lr->lr_gid = ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ?
 	    fuidp->z_fuid_group : (uint64_t)vap->va_gid;
 	lr->lr_size = (uint64_t)vap->va_size;
 	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
 	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
 
 	/* Variable-length data starts right after the fixed record. */
 	tail = (lr_setattr_t *)&lr[1];
 	if (vap->va_mask & AT_XVATTR) {
 		zfs_log_xvattr((lr_attr_t *)tail, xvap);
 		tail = (caddr_t)tail + xvat_bytes;
 	}
 
 	/* Domain information, if any, goes last. */
 	if (fuidp != NULL)
 		(void) zfs_log_fuid_domains(fuidp, tail);
 
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * Handles TX_ACL transactions.
  *
  * File systems older than ZPL_VERSION_FUID log a TX_ACL_V0 record: an
  * lr_acl_v0_t followed by the raw ACEs.  Newer ones log TX_ACL: an
  * lr_acl_t followed by the ACEs (padded to ZIL_ACE_LENGTH), then the
  * FUIDs and FUID domain strings when fuidp is supplied.  Nothing is
  * logged during replay or for an unlinked znode.
  */
 void
 zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
 {
 	size_t aclbytes = vsecp->vsa_aclentsz;
 	size_t ace_space;
 	size_t txsize;
 	int lrsize;
 	int txtype;
 	itx_t *itx;
 	lr_acl_t *lr;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	/* Record flavour, header size and ACE space depend on ZPL version. */
 	if (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) {
 		txtype = TX_ACL_V0;
 		lrsize = sizeof (lr_acl_v0_t);
 		ace_space = aclbytes;
 	} else {
 		txtype = TX_ACL;
 		lrsize = sizeof (lr_acl_t);
 		ace_space = ZIL_ACE_LENGTH(aclbytes);
 	}
 
 	txsize = lrsize + ace_space;
 	if (fuidp != NULL) {
 		txsize += fuidp->z_domain_str_sz;
 		txsize += sizeof (uint64_t) * fuidp->z_fuid_cnt;
 	}
 
 	itx = zil_itx_create(txtype, txsize);
 
 	lr = (lr_acl_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
 	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
 
 	if (txtype == TX_ACL_V0) {
 		lr_acl_v0_t *lrv0 = (lr_acl_v0_t *)lr;
 
 		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
 	} else {
 		caddr_t cursor = (caddr_t)(lr + 1);
 
 		lr->lr_acl_bytes = aclbytes;
 		lr->lr_domcnt = (fuidp != NULL) ? fuidp->z_domain_cnt : 0;
 		lr->lr_fuidcnt = (fuidp != NULL) ? fuidp->z_fuid_cnt : 0;
 		/* Flags are only meaningful when the mask says so. */
 		lr->lr_acl_flags = (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) ?
 		    (uint64_t)vsecp->vsa_aclflags : 0;
 
 		bcopy(vsecp->vsa_aclentp, cursor, aclbytes);
 
 		if (fuidp != NULL) {
 			void *after_ids = zfs_log_fuid_ids(fuidp,
 			    cursor + ZIL_ACE_LENGTH(aclbytes));
 
 			(void) zfs_log_fuid_domains(fuidp, after_ids);
 		}
 	}
 
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, itx, tx);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 296519)
@@ -1,2512 +1,2513 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
 #include <sys/jail.h>
 #include "zfs_comutil.h"
 
 /* Mutex named "zfs_debug", initialised via MTX_SYSINIT as MTX_DEF. */
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 
 /* Root sysctl node for the tunables below: vfs.zfs */
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 
 /* vfs.zfs.super_owner (read-write). */
 int zfs_super_owner;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
     "File system owner can perform privileged operation on his file systems");
 
 /* vfs.zfs.debug (CTLFLAG_RWTUN). */
 int zfs_debug_level;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
     "Debug level");
 
 /*
  * vfs.zfs.version.{acl,spa,zpl}: read-only exports of the compile-time
  * ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION constants.
  */
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
 static int zfs_version_acl = ZFS_ACL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
     "ZFS_ACL_VERSION");
 static int zfs_version_spa = SPA_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
     "SPA_VERSION");
 static int zfs_version_zpl = ZPL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
     "ZPL_VERSION");
 
 /* Forward declarations for the file-local routines used below. */
 static int zfs_mount(vfs_t *vfsp);
 static int zfs_umount(vfs_t *vfsp, int fflag);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor);
 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 /* VFS operations vector for ZFS; entries not listed are left unset. */
 static struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_checkexp =		zfs_checkexp,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 /* Register the vector under the name "zfs" with the JAIL and DELEGADMIN flags. */
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 
 /*
  * Count of active ZFS file systems.
  * It is kept so that the module is not unloaded while file systems are
  * still active, e.g. after a forced unmount (umount -f).
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*
  * VFS sync entry point.
  *
  * Does nothing (returns 0) during a panic or for the lazy syncer pass
  * (MNT_LAZY).  With vfsp == NULL every pool is synced; otherwise the
  * generic vfs_stdsync() runs first and, unless the system is shutting
  * down with the pool suspended, the file system's intent log is
  * committed.  Returns the vfs_stdsync() error if there is one, else 0.
  */
 /*ARGSUSED*/
 static int
 zfs_sync(vfs_t *vfsp, int waitfor)
 {
 	zfsvfs_t *zfsvfs;
 	dsl_pool_t *dp;
 	int error;
 	int skip_commit;
 
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr != NULL)
 		return (0);
 
 	/*
 	 * Ignore the system syncher.  ZFS already commits async data
 	 * at zfs_txg_timeout intervals.
 	 */
 	if (waitfor == MNT_LAZY)
 		return (0);
 
 	if (vfsp == NULL) {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 		return (0);
 	}
 
 	/*
 	 * Sync a specific filesystem.
 	 */
 	zfsvfs = vfsp->vfs_data;
 
 	error = vfs_stdsync(vfsp, waitfor);
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 	dp = dmu_objset_pool(zfsvfs->z_os);
 
 	/*
 	 * If the system is shutting down, then skip any
 	 * filesystems which may exist on a suspended pool.
 	 */
 	skip_commit = (sys_shutdown && spa_suspended(dp->dp_spa));
 	if (!skip_commit && zfsvfs->z_log != NULL)
 		zil_commit(zfsvfs->z_log, 0);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 #ifndef __FreeBSD_kernel__
 /*
  * Pick a device number that is not currently mounted and store it in *dev.
  *
  * Minors are handed out sequentially under zfs_dev_mtx, wrapping once
  * MAXMIN32 is reached.  If a full cycle finds every minor in use, a new
  * major number is requested from getudev() and the search restarts.
  * Returns 0 on success, or -1 if getudev() cannot supply a new major.
  *
  * Only compiled when __FreeBSD_kernel__ is not defined.
  */
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
 		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		/* Remember where this cycle began so a full wrap is noticed. */
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
 			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
 				 * number space.  If we're using a getudev()'ed
 				 * major number, we can use all of its minors.
 				 */
 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 					zfs_minor = ZFS_MIN_MINOR;
 				else
 					zfs_minor = 0;
 			} else {
 				zfs_minor++;
 			}
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
 			 * current major number.  Create a new major number.
 			 */
 			if ((new_major = getudev()) == (major_t)-1) {
 				cmn_err(CE_WARN,
 				    "zfs_mount: Can't get unique major "
 				    "device number.");
 				return (-1);
 			}
 			mutex_enter(&zfs_dev_mtx);
 			zfs_major = new_major;
 			zfs_minor = 0;
 
 			mutex_exit(&zfs_dev_mtx);
 		} else {
 			/* Stopped before wrapping back to start; done. */
 			break;
 		}
 		/* CONSTANTCONDITION */
 	} while (1);
 
 	return (0);
 }
 #endif	/* !__FreeBSD_kernel__ */
 
 /*
  * Property callback for atime: mirrors newval into zfsvfs->z_atime, the
  * MNT_NOATIME flag and the atime/noatime mount options.  Only the exact
  * value TRUE enables atime; anything else disables it.
  */
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval != TRUE) {
 		/* atime off */
 		zv->z_atime = FALSE;
 		zv->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_NOATIME, NULL, 0);
 		return;
 	}
 
 	/* atime on */
 	zv->z_atime = TRUE;
 	zv->z_vfs->vfs_flag &= ~MNT_NOATIME;
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_NOATIME);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_ATIME, NULL, 0);
 }
 
 /*
  * Property callback for xattr: swaps the xattr/noxattr mount options to
  * match newval (only the exact value TRUE enables xattr).  The VFS_XATTR
  * flag itself is only touched when TODO is defined.
  */
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval != TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zv->z_vfs->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 #ifdef TODO
 	zv->z_vfs->vfs_flag |= VFS_XATTR;
 #endif
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_NOXATTR);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_XATTR, NULL, 0);
 }
 
 /*
  * Property callback for the block size: after asserting that newval is a
  * power of two between SPA_MINBLOCKSIZE and the pool's maximum block size,
  * stores it as the maximum block size and as the mount's f_iosize.
  */
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zv->z_os)));
 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 	ASSERT(ISP2(newval));
 
 	zv->z_max_blksz = newval;
 	zv->z_vfs->mnt_stat.f_iosize = newval;
 }
 
 /*
  * Property callback for readonly: any non-zero newval sets VFS_RDONLY and
  * the "ro" mount option; zero clears the flag and sets "rw".
  */
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval == 0) {
 		/* XXX locking on vfs_flag? */
 		zv->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_RW, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 	zv->z_vfs->vfs_flag |= VFS_RDONLY;
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_RW);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_RO, NULL, 0);
 }
 
 /*
  * Property callback for setuid: FALSE sets VFS_NOSETUID and the nosetuid
  * mount option; any other value clears the flag and sets setuid.
  */
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval != FALSE) {
 		/* setuid allowed */
 		zv->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_SETUID, NULL, 0);
 		return;
 	}
 
 	/* setuid disabled */
 	zv->z_vfs->vfs_flag |= VFS_NOSETUID;
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_SETUID);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 }
 
 /*
  * Property callback for exec: FALSE sets VFS_NOEXEC and the noexec mount
  * option; any other value clears the flag and sets exec.
  */
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval != FALSE) {
 		/* exec allowed */
 		zv->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_EXEC, NULL, 0);
 		return;
 	}
 
 	/* exec disabled */
 	zv->z_vfs->vfs_flag |= VFS_NOEXEC;
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_EXEC);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 }
 
 /*
  * The nbmand mount option can only be changed at mount time.
  * Toggling it on a live file system is not allowed, because CIFS clients
  * could then see incorrect behavior.
  *
  * This property isn't registered via dsl_prop_register(), but this
  * callback will be called when a file system is first mounted.
  *
  * FALSE selects the nonbmand mount option; any other value selects nbmand.
  */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zv = arg;
 
 	if (newval != FALSE) {
 		vfs_clearmntopt(zv->z_vfs, MNTOPT_NONBMAND);
 		vfs_setmntopt(zv->z_vfs, MNTOPT_NBMAND, NULL, 0);
 		return;
 	}
 
 	vfs_clearmntopt(zv->z_vfs, MNTOPT_NBMAND);
 	vfs_setmntopt(zv->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 }
 
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_show_ctldir = newval;
 }
 
 static void
 vscan_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_vscan = newval;
 }
 
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_mode = newval;
 }
 
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_inherit = newval;
 }
 
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
 	boolean_t readonly = B_FALSE;
 	boolean_t do_readonly = B_FALSE;
 	boolean_t setuid = B_FALSE;
 	boolean_t do_setuid = B_FALSE;
 	boolean_t exec = B_FALSE;
 	boolean_t do_exec = B_FALSE;
 #ifdef illumos
 	boolean_t devices = B_FALSE;
 	boolean_t do_devices = B_FALSE;
 #endif
 	boolean_t xattr = B_FALSE;
 	boolean_t do_xattr = B_FALSE;
 	boolean_t atime = B_FALSE;
 	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * This function can be called for a snapshot when we update snapshot's
 	 * mount point, which isn't really supported.
 	 */
 	if (dmu_objset_is_snapshot(os))
 		return (EOPNOTSUPP);
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid = B_FALSE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime = B_FALSE;
 		do_atime = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime = B_TRUE;
 		do_atime = B_TRUE;
 	}
 
 	/*
 	 * We need to enter pool configuration here, so that we can use
 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
 	 * dsl_prop_get_integer() can not be used, because it has to acquire
 	 * spa_namespace_lock and we can not do that because we already hold
 	 * z_teardown_lock.  The problem is that spa_config_sync() is called
 	 * with spa_namespace_lock held and the function calls ZFS vnode
 	 * operations to write the cache file and thus z_teardown_lock is
 	 * acquired after spa_namespace_lock.
 	 */
 	ds = dmu_objset_ds(os);
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 
 	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
 	 * This is weird, but it is documented to only be changeable
 	 * at mount time.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
 	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		return (error);
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	error = dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 #ifdef illumos
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 #endif
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 	    zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
 		atime_changed_cb(zfsvfs, atime);
 
 	nbmand_changed_cb(zfsvfs, nbmand);
 
 	return (0);
 
 unregister:
 	dsl_prop_unregister_all(ds, zfsvfs);
 	return (error);
 }
 
 /*
  * Extract the owning user and group ids from an object's bonus buffer.
  *
  * Returns ENOENT when 'bonustype' is neither DMU_OT_ZNODE nor DMU_OT_SA,
  * EEXIST when 'data' is NULL (the ids are assumed not to be changing, so
  * the caller should keep using the ones it has), and 0 with *userp and
  * *groupp filled in otherwise.
  */
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
 	sa_hdr_phys_t sa;
 	uintptr_t attrbase;
 	boolean_t byteswapped;
 	int hdrsize;
 
 	/* Only znode and SA bonus buffers are tracked. */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (SET_ERROR(ENOENT));
 
 	/* No buffer to look at: tell the dmu to reuse the same ids. */
 	if (data == NULL)
 		return (SET_ERROR(EEXIST));
 
 	/* Old-style znode: the ids are plain fields of znode_phys_t. */
 	if (bonustype == DMU_OT_ZNODE) {
 		znode_phys_t *znp = data;
 
 		*userp = znp->zp_uid;
 		*groupp = znp->zp_gid;
 		return (0);
 	}
 
 	ASSERT(bonustype == DMU_OT_SA);
 
 	/* Work on a private copy of the header so it can be byteswapped. */
 	sa = *(sa_hdr_phys_t *)data;
 
 	/*
 	 * A zero magic should only be seen on newly created files whose
 	 * znode data has not been filled in yet.
 	 */
 	if (sa.sa_magic == 0) {
 		*userp = 0;
 		*groupp = 0;
 		return (0);
 	}
 
 	byteswapped = (sa.sa_magic == BSWAP_32(SA_MAGIC));
 	if (byteswapped) {
 		sa.sa_magic = SA_MAGIC;
 		sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 	} else {
 		VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
 	}
 
 	hdrsize = sa_hdrsize(&sa);
 	VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 
 	/* The ids are read at fixed offsets past the SA header. */
 	attrbase = (uintptr_t)data + hdrsize;
 	*userp = *((uint64_t *)(attrbase + SA_UID_OFFSET));
 	*groupp = *((uint64_t *)(attrbase + SA_GID_OFFSET));
 	if (byteswapped) {
 		*userp = BSWAP_64(*userp);
 		*groupp = BSWAP_64(*groupp);
 	}
 	return (0);
 }
 
 /*
  * Split the FUID encoded in the string 'fuidstr' into a domain name and a
  * RID.  The domain is copied into 'domainbuf' (at most 'buflen' bytes);
  * 'domainbuf' is set to the empty string when no domain is found for the
  * FUID's index.  The RID is stored in *ridp.
  */
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
 	uint64_t fuid = strtonum(fuidstr, NULL);
 	const char *domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 
 	if (domain == NULL)
 		domainbuf[0] = '\0';
 	else
 		(void) strlcpy(domainbuf, domain, buflen);
 
 	*ridp = FUID_RID(fuid);
 }
 
 /*
  * Map a user/group space or quota property to the object that holds its
  * entries.  Returns 0 for any other property value.
  */
 static uint64_t
 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 {
 	uint64_t obj = 0;
 
 	if (type == ZFS_PROP_USERUSED)
 		obj = DMU_USERUSED_OBJECT;
 	else if (type == ZFS_PROP_GROUPUSED)
 		obj = DMU_GROUPUSED_OBJECT;
 	else if (type == ZFS_PROP_USERQUOTA)
 		obj = zfsvfs->z_userquota_obj;
 	else if (type == ZFS_PROP_GROUPQUOTA)
 		obj = zfsvfs->z_groupquota_obj;
 
 	return (obj);
 }
 
 /*
  * Copy as many entries of the accounting/quota object selected by 'type'
  * as fit into 'vbuf', starting at the serialized cursor *cookiep.
  *
  * On return *bufsizep holds the number of bytes written and *cookiep the
  * cursor position to resume from.  Returns ENOTSUP when userspace
  * accounting is not present on the objset; reaching the end of the object
  * is not an error.
  */
 int
 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 {
 	zfs_useracct_t *buf = vbuf;
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	uint64_t obj;
 	int error;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	/* No backing object means there is nothing to report. */
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
 		*bufsizep = 0;
 		return (0);
 	}
 
 	zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 	while ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
 		uintptr_t filled = (uintptr_t)buf - (uintptr_t)vbuf;
 
 		/* Stop before writing an entry that would not fit. */
 		if (filled + sizeof (zfs_useracct_t) > *bufsizep)
 			break;
 
 		fuidstr_to_sid(zfsvfs, za.za_name,
 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 		buf->zu_space = za.za_first_integer;
 
 		buf++;
 		zap_cursor_advance(&zc);
 	}
 
 	/* Running off the end of the object just ends the listing. */
 	if (error == ENOENT)
 		error = 0;
 
 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 	*cookiep = zap_cursor_serialize(&zc);
 	zap_cursor_fini(&zc);
 	return (error);
 }
 
 /*
  * Format the FUID for (domain, rid) as a hexadecimal string in 'buf'.
  * A NULL or empty domain uses domain index 0.  Returns ENOENT when the
  * domain lookup reports -1, otherwise 0.
  *
  * buf must be big enough (eg, 32 bytes); it is written without a length
  * check.
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
     char *buf, boolean_t addok)
 {
 	boolean_t have_domain = (domain != NULL && domain[0] != '\0');
 	int domainid;
 	uint64_t fuid;
 
 	domainid = have_domain ?
 	    zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok) : 0;
 	if (domainid == -1)
 		return (SET_ERROR(ENOENT));
 
 	fuid = FUID_ENCODE(domainid, rid);
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	return (0);
 }
 
 /*
  * Look up the single value stored for (domain, rid) in the object selected
  * by 'type' and return it through *valp.
  *
  * *valp is zeroed up front, so a missing object or missing entry yields 0
  * with a successful return.  Returns ENOTSUP when userspace accounting is
  * not present on the objset.
  */
 int
 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valp)
 {
 	char fuidstr[32];
 	uint64_t obj;
 	int err;
 
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, fuidstr, B_FALSE);
 	if (err != 0)
 		return (err);
 
 	err = zap_lookup(zfsvfs->z_os, obj, fuidstr, 8, 1, valp);
 	return (err == ENOENT ? 0 : err);
 }
 
 /*
  * Set, or when 'quota' is zero remove, the quota entry for the identity
  * (domain, rid) in the user or group quota object selected by 'type'.
  *
  * Returns EINVAL if 'type' is neither ZFS_PROP_USERQUOTA nor
  * ZFS_PROP_GROUPQUOTA, ENOTSUP if z_version is below
  * ZPL_VERSION_USERSPACE, the error from id_to_fuidstr() or
  * dmu_tx_assign(), or the result of the ZAP update/remove (a missing
  * entry on removal is treated as success).
  */
 int
 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota)
 {
 	char buf[32];
 	int err;
 	dmu_tx_t *tx;
 	uint64_t *objp;
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 		return (SET_ERROR(EINVAL));
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 		return (SET_ERROR(ENOTSUP));
 
 	/* Points at the cached object number so it can be filled in below. */
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	/*
 	 * Sampled after the lookup above; decides whether the FUID table is
 	 * held in, and synced with, this transaction.
 	 */
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 
 	/*
 	 * Hold the quota object (or a new object if none exists yet).  When
 	 * it has to be created, also hold the master node entry that will
 	 * record it.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 	if (*objp == 0) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    zfs_userquota_prop_prefixes[type]);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/* Create the quota object on first use, under z_lock. */
 	mutex_enter(&zfsvfs->z_lock);
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	/* A quota of zero deletes the entry; a missing entry is fine. */
 	if (quota == 0) {
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 	/* The transaction is committed even if the ZAP operation failed. */
 	dmu_tx_commit(tx);
 	return (err);
 }
 
 /*
  * Report whether the user (or group, when 'isgroup' is set) identified by
  * 'fuid' has used at least as much space as its quota allows.
  *
  * Returns B_FALSE when there is no quota object, during replay, or when
  * either the quota or the usage entry cannot be looked up.
  */
 boolean_t
 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	char fuidstr[32];
 	uint64_t used, quota;
 	uint64_t usedobj, quotaobj;
 
 	if (isgroup) {
 		usedobj = DMU_GROUPUSED_OBJECT;
 		quotaobj = zfsvfs->z_groupquota_obj;
 	} else {
 		usedobj = DMU_USERUSED_OBJECT;
 		quotaobj = zfsvfs->z_userquota_obj;
 	}
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	(void) sprintf(fuidstr, "%llx", (longlong_t)fuid);
 
 	/* The quota is looked up first; usage only if a quota exists. */
 	if (zap_lookup(zfsvfs->z_os, quotaobj, fuidstr, 8, 1, &quota) != 0 ||
 	    zap_lookup(zfsvfs->z_os, usedobj, fuidstr, 8, 1, &used) != 0)
 		return (B_FALSE);
 
 	return (used >= quota);
 }
 
 /*
  * Report whether the owner of znode 'zp' (its group when 'isgroup' is set,
  * otherwise its user) is over quota.  Returns B_FALSE when the matching
  * quota object does not exist or during replay.
  */
 boolean_t
 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
 	uint64_t quotaobj;
 	uint64_t fuid;
 
 	if (isgroup) {
 		quotaobj = zfsvfs->z_groupquota_obj;
 		fuid = zp->z_gid;
 	} else {
 		quotaobj = zfsvfs->z_userquota_obj;
 		fuid = zp->z_uid;
 	}
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
 /*
  * Allocate a zfsvfs_t for the dataset named 'osname', take ownership of
  * its objset and load the ZPL properties and master node entries into it.
  *
  * On success the new structure is stored in *zfvp and 0 is returned.
  * Failures after the objset has been owned go through 'out', which
  * disowns the objset, sets *zfvp to NULL and frees the structure.  The two
  * earlier failure returns (name too long, dmu_objset_own() error) leave
  * *zfvp untouched.
  */
 int
 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	zfsvfs_t *zfsvfs;
 	uint64_t zval;
 	int i, error;
 	uint64_t sa_obj;
 
 	/*
 	 * XXX: Fix struct statfs so this isn't necessary!
 	 *
 	 * The 'osname' is used as the filesystem's special node, which means
 	 * it must fit in statfs.f_mntfromname, or else it can't be
 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
 	 * 'zfs unmount' to think it's not mounted when it is.
 	 */
 	if (strlen(osname) >= MNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	/*
 	 * We claim to always be readonly so we can open snapshots;
 	 * other ZPL code will prevent us from writing to snapshots.
 	 */
 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 	if (error) {
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	/*
 	 * Initialize the zfs-specific filesystem structure.
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
 	zfsvfs->z_vfs = NULL;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	/*
 	 * Refuse file systems whose ZPL version is newer than the one the
 	 * pool's SPA version maps to.
 	 */
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error) {
 		goto out;
 	} else if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 		error = SET_ERROR(ENOTSUP);
 		goto out;
 	}
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_norm = (int)zval;
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 		goto out;
 	zfsvfs->z_utf8 = (zval != 0);
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_case = (uint_t)zval;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 	    zfsvfs->z_case == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		if (error)
 			goto out;
 	} else {
 		/*
 		 * Pre SA versions file systems should never touch
 		 * either the attribute registration or layout objects.
 		 */
 		sa_obj = 0;
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 	if (error)
 		goto out;
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	/* The root object and the unlinked set are mandatory entries. */
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error)
 		goto out;
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error)
 		goto out;
 
 	/*
 	 * The remaining master node entries are optional: ENOENT is
 	 * tolerated and leaves the corresponding field as it was
 	 * (zero from kmem_zalloc()).
 	 */
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
 	if (error && error != ENOENT)
 		goto out;
 
 	/*
 	 * Locks and lists are only initialized once nothing else can fail;
 	 * the 'out' path below therefore has nothing of this to destroy.
 	 */
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/* Success returns 0 directly; 'error' may still hold ENOENT here. */
 	*zfvp = zfsvfs;
 	return (0);
 
 out:
 	dmu_objset_disown(os, zfsvfs);
 	*zfvp = NULL;
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	return (error);
 }
 
 /*
  * Finish bringing a zfsvfs online: register the property callbacks, point
  * the objset's user pointer at the zfsvfs and open the intent log.  When
  * 'mounting' is set, additionally drain the unlinked set (unless the mount
  * is read-only) and replay or destroy the intent log.
  *
  * Returns the error from zfs_register_callbacks(), otherwise 0.
  */
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
 	int error;
 
 	error = zfs_register_callbacks(zfsvfs->z_vfs);
 	if (error)
 		return (error);
 
 	/*
 	 * Set the objset user_ptr to track its zfsvfs.
 	 */
 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
 	 * have to worry about replaying the log as we blocked all
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
 		boolean_t readonly;
 
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 *
 		 * Note that 'readonly' holds the masked VFS_RDONLY bit
 		 * itself rather than B_TRUE/B_FALSE, which is what lets it
 		 * be ORed back into vfs_flag at the end of this block.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 		if (readonly != 0)
 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		else
 			zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 *
 		 * Because of ziltest, this must be done after
 		 * zfs_unlinked_drain().  (Further note: ziltest
 		 * doesn't use readonly mounts, where
 		 * zfs_unlinked_drain() isn't called.)  This is because
 		 * ziltest causes spa_sync() to think it's committed,
 		 * but actually it is not, so the intent log contains
 		 * many txg's worth of changes.
 		 *
 		 * In particular, if object N is in the unlinked set in
 		 * the last txg to actually sync, then it could be
 		 * actually freed in a later txg and then reallocated
 		 * in a yet later txg.  This would write a "create
 		 * object N" record to the intent log.  Normally, this
 		 * would be fine because the spa_sync() would have
 		 * written out the fact that object N is free, before
 		 * we could write the "create object N" intent log
 		 * record.
 		 *
 		 * But when we are in ziltest mode, we advance the "open
 		 * txg" without actually spa_sync()-ing the changes to
 		 * disk.  So we would see that object N is still
 		 * allocated and in the unlinked set, and there is an
 		 * intent log record saying to allocate it.
 		 */
 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
 			if (zil_replay_disable) {
 				zil_destroy(zfsvfs->z_log, B_FALSE);
 			} else {
 				/* z_replay is set only while replaying. */
 				zfsvfs->z_replay = B_TRUE;
 				zil_replay(zfsvfs->z_os, zfsvfs,
 				    zfs_replay_vector);
 				zfsvfs->z_replay = B_FALSE;
 			}
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
 	return (0);
 }
 
 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 /*
  * Destroy the locks and lists embedded in a zfsvfs_t and free the
  * structure itself.
  */
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int idx;
 
 	/*
 	 * Taking and immediately dropping zfsvfs_lock acts as a barrier:
 	 * it prevents the filesystem from going away in zfs_znode_move()
 	 * until we can safely ensure that the filesystem is not unmounted.
 	 * The filesystem is considered valid before the barrier and
 	 * invalid after it.
 	 */
 	rw_enter(&zfsvfs_lock, RW_READER);
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
 
 	/* Tear down the synchronization objects and the znode list. */
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 
 	idx = 0;
 	while (idx != ZFS_OBJ_MTX_SZ) {
 		mutex_destroy(&zfsvfs->z_hold_mtx[idx]);
 		idx++;
 	}
 
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 /*
  * Recompute z_use_fuids and z_use_sa from the ZPL version and, when the
  * zfsvfs is attached to a vfs, set or clear the VFS features that go
  * along with FUID use.
  */
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_vfs != NULL && zfsvfs->z_use_fuids) {
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	} else if (zfsvfs->z_vfs != NULL) {
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	}
 
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 /*
  * Mount the dataset 'osname' on 'vfsp'.
  *
  * Creates the zfsvfs, fills in the generic vfs fields (block size, mount
  * flags, fsid, features) and then either applies the fixed snapshot
  * settings or runs zfsvfs_setup() for a regular file system.  For
  * non-snapshots the control directory is created last.
  *
  * Returns 0 on success, in which case zfs_active_fs_count has been
  * incremented.  On any failure after zfsvfs_create() the objset is
  * disowned, the zfsvfs is freed and the error is returned.
  */
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	error = zfsvfs_create(osname, &zfsvfs);
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
 
 #ifdef illumos
 	/* Initialize the generic filesystem structure. */
 	vfsp->vfs_bcount = 0;
 	vfsp->vfs_data = NULL;
 
 	if (zfs_create_unique_device(&mount_dev) == -1) {
 		error = SET_ERROR(ENODEV);
 		goto out;
 	}
 	ASSERT(vfs_devismounted(mount_dev) == 0);
 #endif
 
 	if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL)) != 0)
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	vfsp->vfs_fsid.val[0] = fsid_guid;
 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);
 
 	/*
 	 * Set features for file system.
 	 */
 	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
 		/*
 		 * Snapshots get fixed settings (no atime, read-only) instead
 		 * of registered property callbacks; only xattr is read from
 		 * the dataset.
 		 */
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if ((error = dsl_prop_get_integer(osname, "xattr", &pval,
 		    NULL)) != 0)
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 	} else {
 		/*
 		 * Bail out immediately if setup fails: the control
 		 * directory must not be created on a zfsvfs that is about
 		 * to be freed below.
 		 */
 		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)) != 0)
 			goto out;
 	}
 
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
 		atomic_inc_32(&zfs_active_fs_count);
 	}
 
 	return (error);
 }
 
 /*
  * Drop every property callback registered for this zfsvfs.  Nothing is
  * done for snapshots.
  */
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 
 	if (dmu_objset_is_snapshot(os))
 		return;
 
 	dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
 }
 
 #ifdef SECLABEL
 /*
  * Parse a string of decimal digits into *objnum.
  *
  * Returns EINVAL (leaving *objnum untouched) if any character is not a
  * decimal digit.  An empty string parses as 0.  No overflow check is
  * performed.
  */
 static int
 str_to_uint64(char *str, uint64_t *objnum)
 {
 	uint64_t value = 0;
 	char *cp;
 
 	for (cp = str; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (SET_ERROR(EINVAL));
 
 		value = value * 10 + (*cp - '0');
 	}
 
 	*objnum = value;
 	return (0);
 }
 
 /*
  * The boot path passed from the boot loader is in the form of
  * "rootpool-name/root-filesystem-object-number".  Convert this
  * string to a dataset name: "rootpool-name/root-filesystem-name".
  *
  * 'bpath' is first copied verbatim to 'outpath'; that copy is the result
  * when there is no '/' or when the part after the first '/' is not a
  * decimal number.  Returns EINVAL for an empty path or one that starts
  * with '/', otherwise 0 or the error from dsl_dsobj_to_dsname().
  */
 static int
 zfs_parse_bootfs(char *bpath, char *outpath)
 {
 	uint64_t objnum;
 	char *sep;
 	int error;
 
 	if (*bpath == 0 || *bpath == '/')
 		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(outpath, bpath);
 
 	/*
 	 * With no '/', this is just the pool name; with a non-numeric
 	 * suffix it is already a dataset name.  Either way the copy made
 	 * above is the answer.
 	 */
 	sep = strchr(bpath, '/');
 	if (sep == NULL || str_to_uint64(sep + 1, &objnum) != 0)
 		return (0);
 
 	/* Temporarily cut bpath down to the pool name for the lookup. */
 	*sep = '\0';
 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
 	*sep = '/';
 
 	return (error);
 }
 
 /*
  * Check that the hex label string is appropriate for the dataset being
  * mounted into the global_zone proper.
  *
  * Return an error if the hex label string is not default or
  * admin_low/admin_high.  For admin_low labels, the corresponding
  * dataset must be readonly.
  *
  * Returns 0 when the label is acceptable and EACCES otherwise, including
  * when the readonly property cannot be read.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
 		/* must be readonly */
 		uint64_t rdonly;
 
 		if (dsl_prop_get_integer(dsname,
 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
 			return (SET_ERROR(EACCES));
 		/* Use SET_ERROR() like every other failure return here. */
 		if (!rdonly)
 			return (SET_ERROR(EACCES));
 		return (0);
 	}
 	return (SET_ERROR(EACCES));
 }
 
 /*
  * Determine whether the mount is allowed according to MAC check.
  * by comparing (where appropriate) label of the dataset against
  * the label of the zone being mounted into.  If the dataset has
  * no label, create one.
  *
  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
  *
  * As a side effect, when the mountpoint label dominates (but does not
  * equal) the dataset label, the MNTOPT_RO option is set on 'vfsp'.
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
 	int		error, retv;
 	zone_t		*mntzone = NULL;
 	ts_label_t	*mnt_tsl;
 	bslabel_t	*mnt_sl;
 	bslabel_t	ds_sl;
 	char		ds_hexsl[MAXNAMELEN];
 
 	retv = EACCES;				/* assume the worst */
 
 	/*
 	 * Start by getting the dataset label if it exists.
 	 */
 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error)
 		return (SET_ERROR(EACCES));
 
 	/*
 	 * If labeling is NOT enabled, then disallow the mount of datasets
 	 * which have a non-default label already.  No other label checks
 	 * are needed.
 	 */
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 			return (0);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Get the label of the mountpoint.  If mounting into the global
 	 * zone (i.e. mountpoint is not within an active zone and the
 	 * zoned property is off), the label must be default or
 	 * admin_low/admin_high only; no other checks are needed.
 	 */
 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
 	if (mntzone->zone_id == GLOBAL_ZONEID) {
 		uint64_t zoned;
 
 		/* Every path out of this branch returns, so release now. */
 		zone_rele(mntzone);
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 			return (SET_ERROR(EACCES));
 		if (!zoned)
 			return (zfs_check_global_label(osname, ds_hexsl));
 		else
 			/*
 			 * This is the case of a zone dataset being mounted
 			 * initially, before the zone has been fully created;
 			 * allow this mount into global zone.
 			 */
 			return (0);
 	}
 
 	/*
 	 * Non-global zone: hold the zone's label for the comparison below.
 	 * Both the label and the zone are released at the end.
 	 */
 	mnt_tsl = mntzone->zone_slabel;
 	ASSERT(mnt_tsl != NULL);
 	label_hold(mnt_tsl);
 	mnt_sl = label2bslabel(mnt_tsl);
 
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
 		/*
 		 * The dataset doesn't have a real label, so fabricate one.
 		 */
 		char *str = NULL;
 
 		/* Access is granted only if the label could be stored. */
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
 		    dsl_prop_set_string(osname,
 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 		    ZPROP_SRC_LOCAL, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
 		/*
 		 * Now compare labels to complete the MAC check.  If the
 		 * labels are equal then allow access.  If the mountpoint
 		 * label dominates the dataset label, allow readonly access.
 		 * Otherwise, access is denied.
 		 */
 		if (blequal(mnt_sl, &ds_sl))
 			retv = 0;
 		else if (bldominates(mnt_sl, &ds_sl)) {
 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 			retv = 0;
 		}
 	}
 	/* A dataset label that fails to parse leaves retv at EACCES. */
 
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
 }
 #endif	/* SECLABEL */
 
 #ifdef OPENSOLARIS_MOUNTROOT
 /*
  * Root file system entry point, dispatching on 'why':
  *
  *   ROOT_INIT	 import the root pool, resolve the "zfs-bootfs" boot
  *		 property to a dataset name, mount it on 'vfsp' and set
  *		 rootvp to its root vnode.  Only the first call proceeds;
  *		 later calls return EBUSY.
  *   ROOT_REMOUNT mark the vfs writable/remounted and re-register the
  *		 property callbacks.
  *   ROOT_UNMOUNT unregister the callbacks and sync.
  *
  * Any other value returns ENOTSUP.
  */
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
 	int error = 0;
 	static int zfsrootdone = 0;
 	zfsvfs_t *zfsvfs = NULL;
 	znode_t *zp = NULL;
 	vnode_t *vp = NULL;
 	char *zfs_bootfs;
 	char *zfs_devid;
 
 	ASSERT(vfsp);
 
 	/*
 	 * The filesystem that we mount as root is defined in the
 	 * boot property "zfs-bootfs" with a format of
 	 * "poolname/root-dataset-objnum".
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
 			return (SET_ERROR(EBUSY));
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
 		 * something better by looking at the timestamp on
 		 * an uberblock, so just set it to -1.
 		 */
 		clkset(-1);
 
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
 			return (SET_ERROR(EINVAL));
 		}
 		/* The device id is optional; it is freed right after use. */
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
 		if (zfs_devid)
 			spa_free_bootprop(zfs_devid);
 		if (error) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
 			    error);
 			return (error);
 		}
 		/* Rewrite rootfs.bo_name with the resolved dataset name. */
 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
 			    error);
 			return (error);
 		}
 
 		spa_free_bootprop(zfs_bootfs);
 
 		/* From here on, failures must leave through 'out'. */
 		if (error = vfs_lock(vfsp))
 			return (error);
 
 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
 			goto out;
 		}
 
 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
 		ASSERT(zfsvfs);
 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
 			goto out;
 		}
 
 		vp = ZTOV(zp);
 		mutex_enter(&vp->v_lock);
 		vp->v_flag |= VROOT;
 		mutex_exit(&vp->v_lock);
 		rootvp = vp;
 
 		/*
 		 * Leave rootvp held.  The root file system is never unmounted.
 		 */
 
 		vfs_add((struct vnode *)0, vfsp,
 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
 out:
 		vfs_unlock(vfsp);
 		return (error);
 	} else if (why == ROOT_REMOUNT) {
 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
 		vfsp->vfs_flag |= VFS_REMOUNT;
 
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		return (zfs_register_callbacks(vfsp));
 
 	} else if (why == ROOT_UNMOUNT) {
 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
 		(void) zfs_sync(vfsp, 0, 0);
 		return (0);
 	}
 
 	/*
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
 	return (SET_ERROR(ENOTSUP));
 }
 #endif	/* OPENSOLARIS_MOUNTROOT */
 
 /*
  * Copy the pool component of the dataset name 'osname' (everything before
  * the first '/', or the whole name if there is none) into 'poolname'.
  *
  * Returns ENAMETOOLONG if that component is MAXNAMELEN characters or
  * longer, otherwise 0.  'poolname' is only written on success.
  */
 static int
 getpoolname(const char *osname, char *poolname)
 {
 	const char *slash = strchr(osname, '/');
 	size_t len;
 
 	len = (slash == NULL) ? strlen(osname) : (size_t)(slash - osname);
 	if (len >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	/* len never exceeds strlen(osname), so exactly len bytes are copied. */
 	(void) strncpy(poolname, osname, len);
 	poolname[len] = '\0';
 	return (0);
 }
 
 /*
  * VFS mount entry point.
  *
  * Takes the dataset name from the "from" mount option, applies the
  * privilege/delegation and zone visibility checks, and then either
  * refreshes the property callbacks (remount) or mounts the dataset via
  * zfs_domount().  For an initial root mount the pool named in 'osname' is
  * imported first.
  *
  * Returns EPERM if the jail may not mount ZFS or the dataset is not
  * visible and writable from the current zone, EINVAL if the "from" option
  * is missing, or the error of whichever later step failed; 0 on success.
  */
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp)
 {
 	kthread_t	*td = curthread;
 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
 	cred_t		*cr = td->td_ucred;
 	char		*osname;
 	int		error = 0;
 	int		canwrite;
 
 #ifdef illumos
 	/*
 	 * NOTE(review): this branch refers to 'uap', which this function
 	 * does not declare, and never assigns 'osname'; it looks like it is
 	 * kept only for reference against the illumos source - confirm.
 	 */
 	if (mvp->v_type != VDIR)
 		return (SET_ERROR(ENOTDIR));
 
 	mutex_enter(&mvp->v_lock);
 	if ((uap->flags & MS_REMOUNT) == 0 &&
 	    (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 		mutex_exit(&mvp->v_lock);
 		return (SET_ERROR(EBUSY));
 	}
 	mutex_exit(&mvp->v_lock);
 
 	/*
 	 * ZFS does not support passing unparsed data in via MS_DATA.
 	 * Users should use the MS_OPTIONSTR interface; this means
 	 * that all option parsing is already done and the options struct
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
 		return (SET_ERROR(EINVAL));
 #else	/* !illumos */
 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
 		return (SET_ERROR(EPERM));
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 		return (SET_ERROR(EINVAL));
 #endif	/* illumos */
 
 	/*
 	 * If full-owner-access is enabled and delegated administration is
 	 * turned on, we must set nosuid.
 	 */
 	if (zfs_super_owner &&
 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Check for mount privilege?
 	 *
 	 * If we don't have privilege then see if
 	 * we have local permission to allow it
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
 		/*
 		 * Each 'goto out' in this block returns the error from
 		 * secpolicy_fs_mount() above; 'error' is not reassigned
 		 * by the fallback checks.
 		 */
 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
 			goto out;
 
 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
 			vattr_t		vattr;
 
 			/*
 			 * Make sure user is the owner of the mount point
 			 * or has sufficient privileges.
 			 */
 
 			vattr.va_mask = AT_UID;
 
 			vn_lock(mvp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(mvp, &vattr, cr)) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 			VOP_UNLOCK(mvp, 0);
 		}
 
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
 	if (!INGLOBALZONE(curthread) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 #ifdef SECLABEL
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
 #endif
 
 	vfsp->vfs_flag |= MNT_NFS4ACLS;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT) {
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 		/*
 		 * Refresh mount options with z_teardown_lock blocking I/O while
 		 * the filesystem is in an inconsistent state.
 		 * The lock also serializes this code with filesystem
 		 * manipulations between entry to zfs_suspend_fs() and return
 		 * from zfs_resume_fs().
 		 */
 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfs_unregister_callbacks(zfsvfs);
 		error = zfs_register_callbacks(vfsp);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		goto out;
 	}
 
 	/* Initial root mount: try hard to import the requested root pool. */
 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
 		char pname[MAXNAMELEN];
 
 		error = getpoolname(osname, pname);
 		if (error == 0)
 			error = spa_import_rootpool(pname);
 		if (error)
 			goto out;
 	}
 	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
 
 #ifdef illumos
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
 #endif
 
 out:
 	return (error);
 }
 
 /*
  * Report space and object usage for the mounted dataset in *statp.
  * Always returns 0 once the filesystem has been entered.
  */
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	uint64_t bytes_used, bytes_free, objs_used, objs_free;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &bytes_used, &bytes_free, &objs_used, &objs_free);
 
 	/*
 	 * A pool mixes several block sizes.  The smallest supported block
 	 * size is reported as f_bsize, and the preferred I/O size is taken
 	 * from the values already recorded on the mount.
 	 */
 	statp->f_bsize = SPA_MINBLOCKSIZE;
 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
 
 	/*
 	 * Block totals are expressed in minimum-size blocks.  There is no
 	 * reservation for root, so f_bavail equals f_bfree.
 	 */
 	statp->f_blocks = (bytes_used + bytes_free) >> SPA_MINBLOCKSHIFT;
 	statp->f_bfree = bytes_free / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree;
 
 	/*
 	 * ZFS does not preallocate files, so the file counts are estimates:
 	 * the free count is the smaller of the number of available objects
 	 * and the number of free blocks (every object needs at least one
 	 * block), and the total is that plus the objects already in use.
 	 */
 	statp->f_ffree = MIN(objs_free, statp->f_bfree);
 	statp->f_files = statp->f_ffree + objs_used;
 
 	statp->f_namemax = ZFS_MAXNAMELEN;
 
 	/* Filesystem type name and the mount's source and target names. */
 	(void) strlcpy(statp->f_fstypename, "zfs",
 	    sizeof(statp->f_fstypename));
 	(void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	(void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Return the root vnode of the filesystem in *vpp, locked according to
  * 'flags' and marked VV_ROOT.
  *
  * Returns 0 on success.  On failure an errno is returned, *vpp is set to
  * NULL and no vnode reference is left behind.
  */
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
 		*vpp = ZTOV(rootzp);
 
 	ZFS_EXIT(zfsvfs);
 
 	if (error == 0) {
 		error = vn_lock(*vpp, flags);
 		if (error == 0) {
 			(*vpp)->v_vflag |= VV_ROOT;
 		} else {
 			/*
 			 * The lock attempt failed, but zfs_zget() still left
 			 * us holding the vnode.  Drop that hold so it is not
 			 * leaked when we hand NULL back to the caller.
 			 */
 			VN_RELE(*vpp);
 		}
 	}
 	if (error != 0)
 		*vpp = NULL;
 
 	return (error);
 }
 
 /*
  * Teardown the zfsvfs::z_os.
  *
  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held.
  *
  * Returns EIO (with both locks dropped) when not unmounting and the file
  * system has already been unmounted or has no z_os; otherwise returns 0.
  */
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;
 
 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
 		 * We purge the parent filesystem's vfsp as the parent
 		 * filesystem and all of its snapshots have their vnode's
 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 #ifdef FREEBSD_NAMECACHE
 		cache_purgevfs(zfsvfs->z_parent->z_vfs);
 #endif
 	}
 
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 
 	/*
 	 * If we are not unmounting (ie: online recv) and someone already
 	 * unmounted this file system while we were doing the switcheroo,
 	 * or a reopen of z_os failed then just bail out now.
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * At this point there are no vops active, and any new vops will
 	 * fail with EIO since we have z_teardown_lock for writer (only
 	 * relevant for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
 		if (zp->z_sa_hdl) {
 			ASSERT(ZTOV(zp)->v_count >= 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * If we are unmounting, set the unmounted flag and let new vops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
 	 * other vops will fail with EIO.
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
 	/*
 	 * z_os will be NULL if there was an error in attempting to reopen
 	 * zfsvfs, so just return as the properties had already been
 	 * unregistered and cached data had been evicted before.
 	 */
 	if (zfsvfs->z_os == NULL)
 		return (0);
 
 	/*
 	 * Unregister properties.
 	 */
 	zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Evict cached data.  A dirty dataset on a writable mount is synced
 	 * out first; the dbufs are evicted in either case.
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
 
 /*
  * Unmount the filesystem.
  *
  * The sequence is: permission check, name cache purge, unmount of any
  * snapshots under .zfs and destruction of the control directory, vnode
  * flush (forced when MS_FORCE is set in fflag), teardown of the zfsvfs,
  * release of the objset and finally freeing of the zfsvfs.
  *
  * Returns 0 on success or an errno from the permission check, the
  * snapshot unmount or the vnode flush (EBUSY when still in use).
  */
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag)
 {
 	kthread_t *td = curthread;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	/*
 	 * Failing the privilege check is only fatal when the caller has not
 	 * been delegated the mount permission on the dataset either.
 	 */
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
 
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
 	 * referential for non-snapshots.
 	 */
 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		/*
 		 * The result of this flush is only checked by the ASSERT.
 		 * NOTE(review): presumably EBUSY is expected because the
 		 * control directory vnode is still referenced -- confirm.
 		 */
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	if (fflag & MS_FORCE) {
 		/*
 		 * Mark file system as unmounted before calling
 		 * vflush(FORCECLOSE). This way we ensure no future vnops
 		 * will be called and risk operating on DOOMED vnodes.
 		 */
 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		/*
 		 * The unmount is being abandoned, so a non-snapshot
 		 * filesystem gets its control directory created again.
 		 */
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 #ifdef illumos
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
 		 * Our count is maintained in the vfs structure, but the
 		 * number is off by 1 to indicate a hold on the vfs
 		 * structure itself.
 		 *
 		 * The '.zfs' directory maintains a reference of its
 		 * own, and any active references underneath are
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
 			if (vfsp->vfs_count > 1)
 				return (SET_ERROR(EBUSY));
 		} else {
 			if (vfsp->vfs_count > 2 ||
 			    zfsvfs->z_ctldir->v_count > 1)
 				return (SET_ERROR(EBUSY));
 		}
 	}
 #endif
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
 
 	/*
 	 * z_os will be NULL if there was an error in
 	 * attempting to reopen zfsvfs.
 	 */
 	if (os != NULL) {
 		/*
 		 * Unset the objset user_ptr.
 		 */
 		mutex_enter(&os->os_user_ptr_lock);
 		dmu_objset_set_user(os, NULL);
 		mutex_exit(&os->os_user_ptr_lock);
 
 		/*
 		 * Finally release the objset
 		 */
 		dmu_objset_disown(os, zfsvfs);
 	}
 
 	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 	if (zfsvfs->z_issnap) {
 		vnode_t *svp = vfsp->mnt_vnodecovered;
 
 		/*
 		 * NOTE(review): presumably this drops an extra reference
 		 * taken on the covered vnode when the snapshot was
 		 * mounted -- confirm against the mount path.
 		 */
 		if (svp->v_count >= 2)
 			VN_RELE(svp);
 	}
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 /*
  * Look up the vnode for inode number 'ino' and return it in *vpp,
  * locked according to 'flags'.
  *
  * Returns 0 on success.  Returns EOPNOTSUPP for the virtual .zfs entries,
  * EINVAL when the znode has been unlinked, or the error from zfs_zget()
  * or vn_lock().  On any failure *vpp is set to NULL and no vnode
  * reference is left behind.
  */
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int 		err;
 
 	/*
 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
 	 * This makes NFS switch to LOOKUP instead of using VGET.
 	 */
 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
 		return (EOPNOTSUPP);
 
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err == 0)
 		*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	if (err == 0) {
 		err = vn_lock(*vpp, flags);
 		/*
 		 * A failed lock attempt must not leak the hold that
 		 * zfs_zget() gave us on the vnode.
 		 */
 		if (err != 0)
 			VN_RELE(*vpp);
 	}
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 /*
  * Check export permissions for a client address by deferring to the
  * standard VFS check on the file system that owns the export settings.
  */
 static int
 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vfs_t *export_vfsp;
 
 	/*
 	 * Only the parent file system has mnt_export configured.  For a
 	 * regular file system z_parent->z_vfs is vfsp itself; for a
 	 * snapshot it is the vfs of the file system the snapshot belongs
 	 * to, which is the one that must be consulted.
 	 */
 	export_vfsp = zfsvfs->z_parent->z_vfs;
 
 	return (vfs_stdcheckexp(export_vfsp, nam, extflagsp, credanonp,
 	    numsecflavors, secflavors));
 }
 
 /* Both the short and the long ZFS file ID must fit in a struct fid. */
 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
 
 /*
  * Translate a file handle (fid) into a locked vnode.
  *
  * A short fid carries an object number and a generation.  A long fid
  * additionally carries an objset id and generation; when vfsp is not
  * itself a snapshot, the objset id is used to switch to the matching
  * zfsvfs.  Fids that name the .zfs, .zfs/snapshot or .zfs/shares entries
  * are resolved through the control directory rather than zfs_zget().
  *
  * Returns 0 with *vpp held and locked.  On failure returns an errno
  * (EINVAL for a malformed fid, a generation mismatch or an unlinked
  * znode) with *vpp set to NULL.
  */
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int 		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * On FreeBSD we can get snapshot's mount point or its parent file
 	 * system mount point depending if snapshot is already mounted or not.
 	 */
 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		/* The fid fields are byte arrays, least significant first. */
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		/* Decoded, but not otherwise used by this function. */
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		/* From here on, operate on the zfsvfs of the named objset. */
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (SET_ERROR(EINVAL));
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
 	 * we are in the .zfs/shares directory tree.
 	 */
 	if ((fid_gen == 0 &&
 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else if (object == zfsvfs->z_shares_dir) {
 			VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		err = vn_lock(*vpp, flags);
 		if (err != 0) {
 			/*
 			 * Every branch above leaves *vpp held; drop that
 			 * hold rather than leaking it when the lock
 			 * attempt fails.
 			 */
 			VN_RELE(*vpp);
 			*vpp = NULL;
 		}
 		return (err);
 	}
 
 	/*
 	 * 'i' still equals sizeof (zfid->zf_gen) from the loop above, so
 	 * the mask keeps exactly as many low bytes as the fid stores.
 	 */
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%llu mask %llx]\n",
 	    (unsigned long long)object, (unsigned long long)fid_gen,
 	    (unsigned long long)gen_mask);
 	if ((err = zfs_zget(zfsvfs, object, &zp)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
 	    sizeof (uint64_t));
 	zp_gen = zp_gen & gen_mask;
 	/* A masked generation of zero is compared as 1. */
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%llu) != fid gen (%llu)\n",
 		    (unsigned long long)zp_gen, (unsigned long long)fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	err = vn_lock(*vpp, flags | LK_RETRY);
 	if (err == 0)
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
 	else
 		*vpp = NULL;
 	return (err);
 }
 
 /*
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
  * dataset and objset intact so that they can be atomically handed off during
  * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
 	/*
 	 * Suspending is a teardown without unmounting; its result, zero
 	 * or an errno, is handed straight back to the caller.
 	 */
 	return (zfsvfs_teardown(zfsvfs, B_FALSE));
 }
 
 /*
  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
  * is an invariant across any of the operations that can be performed while the
  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
  * are the same: the relevant objset and associated dataset are owned by
  * zfsvfs, held, and long held on entry.
  */
 int
 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
 {
 	int err;
 	znode_t *zp;
 	uint64_t sa_obj = 0;
 
 	/* The caller must still hold both teardown locks for writing. */
 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
 	/*
 	 * We already own this, so just hold and rele it to update the
 	 * objset_t, as the one we had before may have been evicted.
 	 */
 	VERIFY0(dmu_objset_hold(osname, zfsvfs, &zfsvfs->z_os));
 	VERIFY3P(zfsvfs->z_os->os_dsl_dataset->ds_owner, ==, zfsvfs);
 	VERIFY(dsl_dataset_long_held(zfsvfs->z_os->os_dsl_dataset));
 	dmu_objset_rele(zfsvfs->z_os, zfsvfs);
 
 	/*
 	 * Make sure version hasn't changed
 	 */
 
 	err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
 	    &zfsvfs->z_version);
 
 	if (err)
 		goto bail;
 
 	/*
 	 * A failed lookup of the SA attribute object is fatal only for
 	 * SA-capable ZPL versions; otherwise sa_obj keeps its initial 0
 	 * and err is overwritten by the sa_setup() result below.
 	 */
 	err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_SA_ATTRS, 8, 1, &sa_obj);
 
 	if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
 		goto bail;
 
 	if ((err = sa_setup(zfsvfs->z_os, sa_obj,
 	    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
 		goto bail;
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(zfsvfs->z_os,
 		    zfs_sa_upgrade);
 
 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	/*
 	 * Attempt to re-establish all the active znodes with
 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
 	 * when they try to use their znode.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 		(void) zfs_rezget(zp);
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
 		 * Since we couldn't setup the sa framework, try to force
 		 * unmount this file system.
 		 */
 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
 			vfs_ref(zfsvfs->z_vfs);
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
 		}
 	}
 	/* The original setup error is returned even if the unmount ran. */
 	return (err);
 }
 
 /*
  * Free the zfsvfs attached to vfsp and drop the count of active ZFS
  * file systems by one.
  */
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs;
 
 	zfsvfs = vfsp->vfs_data;
 
 #ifdef illumos
 	/*
 	 * A snapshot mounted through zfs_mount() holds an extra VFS_HOLD
 	 * on its parent, which is released here.  A mount that came in
 	 * through zfs_mountroot() never took that hold, so rootvfs is
 	 * left alone.
 	 */
 	if (zfsvfs->z_issnap) {
 		if (vfsp != rootvfs)
 			VFS_RELE(zfsvfs->z_parent->z_vfs);
 	}
 #endif
 
 	zfsvfs_free(zfsvfs);
 
 	atomic_dec_32(&zfs_active_fs_count);
 }
 
 #ifdef __i386__
 /* desiredvnodes as saved by zfs_vnodes_adjust(), restored at fini time. */
 static int desiredvnodes_backup;
 #endif
 
 /*
  * On i386 only: remember the current desiredvnodes and, if it still has
  * its computed default, lower it to three quarters of that value.  This
  * is a no-op on every other architecture.
  */
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int newdesiredvnodes;
 
 	/* Saved so that zfs_vnodes_adjust_back() can undo the change. */
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * We calculate newdesiredvnodes the same way it is done in
 	 * vntblinit(). If it is equal to desiredvnodes, it means that
 	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
 	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
 	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode))));
 	if (newdesiredvnodes == desiredvnodes)
 		desiredvnodes = (3 * newdesiredvnodes) / 4;
 #endif
 }
 
 /*
  * On i386 only: restore desiredvnodes to the value saved by
  * zfs_vnodes_adjust().  This is a no-op on every other architecture.
  */
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 /*
  * One-time initialization of the ZFS filesystem layer: announce the ZPL
  * version, set up the .zfs control directory and znode subsystems, tune
  * the vnode limit and register the space-delta callback for ZFS objsets.
  */
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 /*
  * Undo zfs_init(): tear down the .zfs control directory and znode
  * subsystems and restore the vnode limit changed by zfs_vnodes_adjust().
  */
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 /*
  * Report whether any ZFS file system is currently active: returns 1 when
  * the active file system count is non-zero and 0 otherwise.
  */
 int
 zfs_busy(void)
 {
 	if (zfs_active_fs_count == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Upgrade the ZPL version of the filesystem to 'newvers'.
  *
  * Returns EINVAL when newvers is outside the supported range or lower
  * than the current version, ENOTSUP when the pool's SPA version is older
  * than zfs_spa_version_map(newvers), or the error from dmu_tx_assign()
  * or zap_update().  Returns 0 once the new version has been recorded.
  */
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
 	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (SET_ERROR(EINVAL));
 
 	/* Downgrades are refused. */
 	if (newvers < zfsvfs->z_version)
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_spa_version_map(newvers) >
 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
 		return (SET_ERROR(ENOTSUP));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	/*
 	 * Crossing into an SA-capable version on a filesystem that does not
 	 * use SA yet also creates the SA master node below, so hold the
 	 * objects that step will touch.
 	 */
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    ZFS_SA_ATTRS);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
 	/* The tx has been assigned, so it is committed even on failure. */
 	if (error) {
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		uint64_t sa_obj;
 
 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
 		    SPA_VERSION_SA);
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 
 		error = zap_add(os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
 	/* Logged before z_version is updated so both versions appear. */
 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
 	    "from %llu to %llu", zfsvfs->z_version, newvers);
 
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Read a property stored within the master node.
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
 	const char *pname;
 	int error = ENOENT;
 
 	/*
 	 * The version property is stored under its own key; every other
 	 * property is stored under its regular property name.
 	 */
 	pname = (prop == ZFS_PROP_VERSION) ?
 	    ZPL_VERSION_STR : zfs_prop_to_name(prop);
 
 	/* Without an objset there is nothing to look up: treat as unset. */
 	if (os != NULL)
 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
 
 	/* Found, or failed for a reason other than "no such entry". */
 	if (error != ENOENT)
 		return (error);
 
 	/* Nothing stored: hand back the default where one is defined. */
 	switch (prop) {
 	case ZFS_PROP_VERSION:
 		*value = ZPL_VERSION;
 		return (0);
 	case ZFS_PROP_NORMALIZE:
 	case ZFS_PROP_UTF8ONLY:
 		*value = 0;
 		return (0);
 	case ZFS_PROP_CASE:
 		*value = ZFS_CASE_SENSITIVE;
 		return (0);
 	default:
 		break;
 	}
 
 	/* No default for this property; error is still ENOENT here. */
 	return (error);
 }
 
 #ifdef _KERNEL
 /*
  * After a dataset rename from 'oldname' to 'newname', rewrite the
  * f_mntfromname of every mount that names the dataset itself or
  * something beneath it (a '/' child or an '@' snapshot).
  */
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 	char renamed[MAXPATHLEN];
 	struct mount *mp;
 	char *from;
 	size_t prefixlen = strlen(oldname);
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		from = mp->mnt_stat.f_mntfromname;
 
 		if (strcmp(from, oldname) == 0) {
 			/* The renamed dataset itself. */
 			(void)strlcpy(from, newname,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		} else if (strncmp(from, oldname, prefixlen) == 0 &&
 		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
 			/*
 			 * A descendant or snapshot: swap the old prefix for
 			 * the new name and keep the remainder of the path.
 			 */
 			(void)snprintf(renamed, sizeof(renamed), "%s%s",
 			    newname, from + prefixlen);
 			(void)strlcpy(from, renamed,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 #endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 296519)
@@ -1,7199 +1,7200 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <vm/vm_param.h>
 #include <vm/vm_pageout.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1)	A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  *	can return EIO from the calling function.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 
 /*
  * Open a file.  Returns EPERM when an append-only file is opened for
  * writing without FAPPEND, EACCES when the virus scan rejects the file,
  * and 0 otherwise.
  */
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Writers of an append-only file must ask for FAPPEND. */
 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	/*
 	 * With scanning enabled, non-empty regular files that are not
 	 * already quarantined are scanned before the open may proceed.
 	 */
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 		if (fs_vscan(*vpp, cr, 0) != 0) {
 			error = SET_ERROR(EACCES);
 			goto out;
 		}
 	}
 
 	/* Synchronous opens are counted in the znode. */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 
 out:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Close a file: drop the calling process's locks and shares on the
  * vnode, undo the synchronous-open accounting done at open time and run
  * the close-time virus scan where applicable.  Returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	/* Locks and shares held by this process on vp go away first. */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Only when count is 1 is a synchronous open un-counted. */
 	if ((flag & (FSYNC | FDSYNC)) != 0) {
 		if (count == 1)
 			atomic_dec_32(&zp->z_sync_cnt);
 	}
 
 	/* Same eligibility test as at open time; the scan must succeed. */
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 		VERIFY(fs_vscan(vp, cr, 1) == 0);
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	uint64_t next = (uint64_t)*off;	/* candidate offset */
 	uint64_t eof = zp->z_size;
 	boolean_t hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
 	int error;
 
 	/* Starting at or past end-of-file there is nothing to find. */
 	if (next >= eof)
 		return (SET_ERROR(ENXIO));
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &next);
 	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));
 
 	/*
 	 * dmu_offset_next() works on whole blocks, so a hole may be
 	 * reported beyond the logical end-of-file when EOF falls inside a
 	 * block.  The "virtual hole" at the end of the file is reported as
 	 * starting at EOF rather than at the end of the last block.
 	 */
 	if (next > eof) {
 		ASSERT(hole);
 		next = eof;
 	}
 
 	/* The caller's offset is never moved backwards. */
 	if (next >= *off)
 		*off = next;
 
 	return (error);
 }
 
 /*
  * Handle the file ioctls that ZFS answers itself.
  *
  *	_FIOFFS, _FIOGDIO, _FIOSDIO
  *		Accepted as no-ops; return 0.
  *	_FIO_SEEK_DATA, _FIO_SEEK_HOLE
  *		"data" refers to an offset_t that is both the starting offset
  *		and, on success, the offset found by zfs_holey().
  *	_FIO_COUNT_FILLED (illumos builds only)
  *		Stores the object's doi_fill_count through "data".
  *
  * Every other command fails with ENOTTY.
  */
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
 	offset_t ndata;
 	dmu_object_info_t doi;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
 	{
 		/* Accepted as a no-op. */
 		return (0);
 	}
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	case _FIOGDIO:
 	case _FIOSDIO:
 	{
 		return (0);
 	}
 
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
 	{
 #ifdef illumos
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		/*
 		 * NOTE(review): "data" is dereferenced directly here, so it is
 		 * presumably already a kernel address on this platform --
 		 * confirm against the caller.
 		 */
 		off = *(offset_t *)data;
 #endif
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 		/* Hand the resulting offset back only on success. */
 #ifdef illumos
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		*(offset_t *)data = off;
 #endif
 		return (0);
 	}
 #ifdef illumos
 	case _FIO_COUNT_FILLED:
 	{
 		/*
 		 * _FIO_COUNT_FILLED adds a new ioctl command which
 		 * exposes the number of filled blocks in a
 		 * ZFS object.
 		 */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/*
 		 * Wait for all dirty blocks for this object
 		 * to get synced out to disk, and the DMU info
 		 * updated.
 		 */
 		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Retrieve fill count from DMU object.
 		 */
 		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		ndata = doi.doi_fill_count;
 
 		ZFS_EXIT(zfsvfs);
 		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
 			return (SET_ERROR(EFAULT));
 		return (0);
 	}
 #endif
 	}
 	/* Not one of the commands handled above. */
 	return (SET_ERROR(ENOTTY));
 }
 
 /*
  * Find or allocate the page of vp's VM object that covers file offset
  * "start", shared-busy it and prepare it to be refreshed.
  *
  *	IN:	vp	- vnode whose v_object is searched.
  *		start	- file offset used to compute the page index.
  *		off	- offset within the page of the range being updated.
  *		nbytes	- length of the range being updated.
  *
  *	RETURN:	the shared-busied page, or NULL when the page exists but has
  *		no valid bits or when vm_page_alloc() returns NULL.
  *
  * For a returned page the object's paging-in-progress count has been
  * raised by one, write access has been removed from its mappings, and the
  * DEV_BSIZE-aligned interior of [off, off + nbytes) has been marked clean.
  * page_unbusy() undoes the busy and the paging-in-progress reference.
  *
  * The caller must hold the object write lock; it is dropped and retaken
  * while sleeping on an exclusively busied page.
  */
 static vm_page_t
 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t end;
 
 	/*
 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 	 * aligned boundaries, if the range is not aligned.  As a result a
 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
 	 * the whole page would be considered clean despite having some dirty
 	 * data.
 	 * For this reason we should shrink the range to DEV_BSIZE aligned
 	 * boundaries before calling vm_page_clear_dirty.
 	 */
 	end = rounddown2(off + nbytes, DEV_BSIZE);
 	off = roundup2(off, DEV_BSIZE);
 	nbytes = end - off;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb");
 				zfs_vmobject_wlock(obj);
 				/* The object lock was dropped: look up again. */
 				continue;
 			}
 			vm_page_sbusy(pp);
 		} else if (pp == NULL) {
 			/* No page at this index yet; allocate one busied. */
 			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
 			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
 			    VM_ALLOC_SBUSY);
 		} else {
 			/* Page exists but holds no valid data: skip it. */
 			ASSERT(pp != NULL && !pp->valid);
 			pp = NULL;
 		}
 
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_object_pip_add(obj, 1);
 			pmap_remove_write(pp);
 			/* The aligned range may have shrunk to nothing. */
 			if (nbytes != 0)
 				vm_page_clear_dirty(pp, off, nbytes);
 		}
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Release a page obtained from page_busy(): drop the shared busy state and
  * give back the paging-in-progress reference taken on the page's object.
  */
 static void
 page_unbusy(vm_page_t pp)
 {
 
 	vm_page_sunbusy(pp);
 	vm_object_pip_subtract(pp->object, 1);
 }
 
 /*
  * Look up the page of vp's VM object that covers file offset "start" and
  * place a hold on it.
  *
  *	RETURN:	the held page, or NULL when there is no page at that index
  *		or the page has no valid bits.
  *
  * The caller must hold the object write lock; it is dropped and retaken
  * while sleeping on an exclusively busied page, after which the lookup is
  * repeated.  A returned page is released with page_unhold().
  */
 static vm_page_t
 page_hold(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb");
 				zfs_vmobject_wlock(obj);
 				/* The object lock was dropped: look up again. */
 				continue;
 			}
 
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			/* The hold count is changed under the page lock. */
 			vm_page_lock(pp);
 			vm_page_hold(pp);
 			vm_page_unlock(pp);
 
 		} else
 			pp = NULL;
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Drop the hold taken by page_hold().  The hold count is changed under the
  * page lock, mirroring how page_hold() acquired it.
  */
 static void
 page_unhold(vm_page_t pp)
 {
 
 	vm_page_lock(pp);
 	vm_page_unhold(pp);
 	vm_page_unlock(pp);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	For every page of the written range that page_busy() returns,
  *		the written bytes are copied from the DMU into the page with
  *		dmu_read().
  *
  *	IN:	vp	- vnode that was written.
  *		start	- file offset at which the write began.
  *		len	- number of bytes written.
  *		os, oid	- objset and object to read the data back from.
  *		segflg	- uio segment flag; must not be UIO_NOCOPY.
  *		tx	- transaction of the write [UNUSED].
  *
  * The result of dmu_read() is deliberately ignored.
  */
 static void
 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
     int segflg, dmu_tx_t *tx)
 {
 	vm_object_t obj;
 	struct sf_buf *sf;
 	caddr_t va;
 	int off;
 
 	ASSERT(segflg != UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	/* Only the first page can start at a non-zero in-page offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		int nbytes = imin(PAGESIZE - off, len);
 
 		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 			/* Do the copy without the object lock held. */
 			zfs_vmobject_wunlock(obj);
 
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, oid, start+off, nbytes,
 			    va+off, DMU_READ_PREFETCH);
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
 	vm_object_pip_wakeupn(obj, 0);
 	zfs_vmobject_wunlock(obj);
 }
 
 /*
  * Read with UIO_NOCOPY flag means that sendfile(2) requests
  * ZFS to populate a range of page cache pages with data.
  *
  * For each page of the range the page is grabbed shared-busy.  A page with
  * no valid bits is filled from the DMU (the tail is zeroed when fewer than
  * PAGESIZE bytes are read), marked fully valid and activated; a page that
  * is already valid is only unbusied.  No data is copied to the caller:
  * uio_resid and uio_offset are advanced by hand.
  *
  *	IN:	vp	- vnode being read.
  *		nbytes	- number of bytes to populate.
  *		uio	- must be UIO_NOCOPY with a page-aligned offset.
  *
  *	RETURN:	0 on success, or the error from dmu_read().  On error the
  *		page being filled is freed if it is unwired, still invalid
  *		and not busied, and the uio is not advanced for it.
  *
  * NOTE: this function could be optimized to pre-allocate
  * all pages in advance, drain exclusive busy on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  */
 static int
 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	struct sf_buf *sf;
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 
 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 
 	zfs_vmobject_wlock(obj);
 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 		if (pp->valid == 0) {
 			/* Fill the page without the object lock held. */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
 			/* A short final page gets a zeroed tail. */
 			if (bytes != PAGESIZE && error == 0)
 				bzero(va + bytes, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			vm_page_sunbusy(pp);
 			vm_page_lock(pp);
 			if (error) {
 				/* Discard the page only if nobody uses it. */
 				if (pp->wire_count == 0 && pp->valid == 0 &&
 				    !vm_page_busied(pp))
 					vm_page_free(pp);
 			} else {
 				pp->valid = VM_PAGE_BITS_ALL;
 				vm_page_activate(pp);
 			}
 			vm_page_unlock(pp);
 		} else {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_sunbusy(pp);
 		}
 		if (error)
 			break;
 		uio->uio_resid -= bytes;
 		uio->uio_offset += bytes;
 		len -= bytes;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	 the file is memory mapped.
  *
  *	IN:	vp	- vnode being read.
  *		nbytes	- number of bytes to transfer.
  *		uio	- supplies the starting offset and receives the data.
  *
  *	RETURN:	0 on success, or the first error from uiomove() or
  *		dmu_read_uio_dbuf(); the loop stops at that error.
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	vm_object_t obj;
 	int64_t start;
 	int len = nbytes;
 	int off;
 	int error = 0;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	/* Only the first page can start at a non-zero in-page offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 		if ((pp = page_hold(vp, start)) != NULL) {
 			struct sf_buf *sf;
 			caddr_t va;
 
 			/* A valid page exists: copy straight out of it. */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = uiomove(va + off, bytes, UIO_READ, uio);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			page_unhold(pp);
 		} else {
 			/* No usable page: read this piece through the DMU. */
 			zfs_vmobject_wunlock(obj);
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 			zfs_vmobject_wlock(obj);
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * Tunable: zfs_read() never lets a single chunk of a read cross a multiple
  * of this many bytes.
  */
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 on success, error code on failure.
  *		EACCES	- the file is quarantined (ZFS_AV_QUARANTINED).
  *		EINVAL	- the starting offset is negative.
  *		EIO	- a checksum error (ECKSUM) occurred while reading.
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	ssize_t		n, nbytes;
 	int		error = 0;
 	rl_t		*rl;
 	xuio_t		*xuio = NULL;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE(zp->z_mode)) {
 		error = chklock(vp, FREAD,
 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct);
 		if (error != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (zfsvfs->z_log &&
 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
 	/* Never read beyond the current end of file. */
 	ASSERT(uio->uio_loffset < zp->z_size);
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
 #ifdef illumos
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
 		int blksz = zp->z_blksz;
 		uint64_t offset = uio->uio_loffset;
 
 		xuio = (xuio_t *)uio;
 		if ((ISP2(blksz))) {
 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 			    blksz)) / blksz;
 		} else {
 			ASSERT(offset + n <= blksz);
 			nblk = 1;
 		}
 		(void) dmu_xuio_init(xuio, nblk);
 
 		if (vn_has_cached_data(vp)) {
 			/*
 			 * For simplicity, we always allocate a full buffer
 			 * even if we only expect to read a portion of a block.
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    blksz), 0, blksz);
 			}
 		}
 	}
 #endif	/* illumos */
 
 	while (n > 0) {
 		/* Stop each chunk at the next zfs_read_chunk_size boundary. */
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 #ifdef __FreeBSD__
 		if (uio->uio_segflg == UIO_NOCOPY)
 			error = mappedread_sf(vp, nbytes, uio);
 		else
 #endif /* __FreeBSD__ */
 		if (vn_has_cached_data(vp)) {
 			error = mappedread(vp, nbytes, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes);
 		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 
 		n -= nbytes;
 	}
 out:
 	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
  *			  set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context (NFS/CIFS fem monitor only)
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 on success, error code on failure.  A write that made
  *		partial progress outside of replay returns 0.
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error = 0;
 	arc_buf_t	*abuf;
 	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
 	iovec_t		*iovp = uio->uio_iov;
 	int		write_eof;
 	int		count = 0;
 	sa_bulk_attr_t	bulk[4];
 	uint64_t	mtime[2], ctime[2];
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Attributes pushed together by sa_bulk_update() after each chunk. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/*
 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 	 * callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * If immutable or not appending then return EPERM
 	 */
 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 	    (uio->uio_loffset < zp->z_size))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Validate file offset
 	 */
 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check for mandatory locks before calling zfs_range_lock()
 	 * in order to prevent a deadlock with locks set via fcntl().
 	 */
 	if (MANDMODE((mode_t)zp->z_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef illumos
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 		xuio = (xuio_t *)uio;
 	else
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 		woff = rl->r_off;
 		if (rl->r_len == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
 			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
 		/*
 		 * Note that if the file block size will change as a result of
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
 	/* Both size-limit failures report EFBIG through SET_ERROR(). */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
 
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
 
 	/* Clip the request so that it ends at the offset limit. */
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
 	write_eof = (woff + n > zp->z_size);
 
 	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
 		/* No buffer has been borrowed yet, so nothing to return. */
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
 		if (xuio && abuf == NULL) {
 			ASSERT(i_iov < iovcnt);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 			    iovec_t *, aiov, arc_buf_t *, abuf);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block.  "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
 			 * a transaction.  This avoids the possibility of
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
 			error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes);
 			if (error != 0) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				/*
 				 * File's blocksize is already larger than the
 				 * "recordsize" property.  Only let it grow to
 				 * the next power of 2.
 				 */
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size,
 				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		if (woff + nbytes > zp->z_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		if (abuf == NULL) {
 			/* tx_bytes is however far the uio actually advanced. */
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 			/*
 			 * If this is not a full block write, but we are
 			 * extending the file past EOF and this data starts
 			 * block-aligned, use assign_arcbuf().  Otherwise,
 			 * write via dmu_write().
 			 */
 			if (tx_bytes < max_blksz && (!write_eof ||
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    woff, abuf, tx);
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
 		/* Keep any mapped pages in step with what was just written. */
 		if (tx_bytes && vn_has_cached_data(vp)) {
 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 			    zp->z_id, uio->uio_segflg, tx);
 		}
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to do this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 *
 		 * Note: we don't call zfs_fuid_map_id() here because
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(vp, cr,
 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 			uint64_t newmode;
 			zp->z_mode &= ~(S_ISUID | S_ISGID);
 			newmode = zp->z_mode;
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
 			ASSERT(error == 0);
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
 		 * the file size to the specified eof. Note, there's no
 		 * concurrency during replay.
 		 */
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
 #ifdef illumos
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif
 	}
 
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Tear down a zgd_t set up by zfs_get_data(): release the held dbuf (if
  * any), drop the range lock, release the vnode, record the block with the
  * ZIL when the operation succeeded, and free the zgd itself.
  *
  *	IN:	zgd	- state allocated by zfs_get_data(); freed here.
  *		error	- result of the operation; zil_add_block() is only
  *			  called when this is 0 and a block pointer is set.
  */
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *zp = zgd->zgd_private;
 	objset_t *os = zp->z_zfsvfs->z_os;
 
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	/*
 	 * The txg is currently stopped from syncing, so the vnode is
 	 * released asynchronously.
 	 */
 	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 
 	if (error == 0 && zgd->zgd_bp != NULL)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (*zgd));
 }
 
 #ifdef DEBUG
 /*
  * Debug-only fault injection: when non-zero, the next indirect write in
  * zfs_get_data() fails with EIO and the flag is cleared again.
  */
 static int zil_fault_io = 0;
 #endif
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  *	IN:	arg	- the zfsvfs_t of the file system.
  *		lr	- write log record naming the object, offset and
  *			  length; its block pointer is filled in for an
  *			  indirect write.
  *		buf	- destination for an immediate write, or NULL to
  *			  request an indirect write.
  *		zio	- parent zio handed to dmu_sync(); must not be NULL.
  *
  *	RETURN:	0 on success; ENOENT if the file is gone, unlinked, or
  *		was truncated below the record's offset; otherwise the error
  *		from the DMU.
  *
  * On a successful indirect write the zgd (range lock, dbuf hold and vnode
  * reference) stays alive until dmu_sync() invokes zfs_get_done(); on every
  * other path past the allocation zfs_get_done() is called here.
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
 	ASSERT(zio != NULL);
 	ASSERT(size != 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (SET_ERROR(ENOENT));
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
 		VN_RELE_ASYNC(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* From here on zfs_get_done() owns the cleanup of zgd and zp. */
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zfsvfs->z_log;
 	zgd->zgd_private = zp;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			uint64_t blkoff;
 			size = zp->z_blksz;
 			/* Round offset down to the start of its block. */
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
 			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
 			    RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			/* Block size changed meanwhile: undo and retry. */
 			offset += blkoff;
 			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
 			error = SET_ERROR(ENOENT);
 #ifdef DEBUG
 		if (zil_fault_io) {
 			error = SET_ERROR(EIO);
 			zil_fault_io = 0;
 		}
 #endif
 		if (error == 0)
 			error = dmu_buf_hold(os, object, offset, zgd, &db,
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *obp = dmu_buf_get_blkptr(db);
 			if (obp) {
 				ASSERT(BP_IS_HOLE(bp));
 				*bp = *obp;
 			}
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zfs_get_done, zgd);
 			ASSERT(error || lr->lr_length <= zp->z_blksz);
 
 			/*
 			 * On success, we need to wait for the write I/O
 			 * initiated by dmu_sync() to complete before we can
 			 * release this dbuf.  We will finish everything up
 			 * in the zfs_get_done() callback.
 			 */
 			if (error == 0)
 				return (0);
 
 			/*
 			 * EALREADY is not a failure: the record is turned
 			 * into a TX_WRITE2 and success is reported.
 			 */
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
 				error = 0;
 			}
 		}
 	}
 
 	zfs_get_done(zgd, error);
 
 	return (error);
 }
 
 /*
  * Check whether the caller may access the file in the requested mode.
  *
  * When V_ACE_MASK is set in flag the check goes to zfs_zaccess(); otherwise
  * it goes to zfs_zaccess_rwx().
  *
  *	RETURN:	the result of the selected access check.
  */
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = (flag & V_ACE_MASK) ?
 	    zfs_zaccess(zp, mode, flag, B_FALSE, cr) :
 	    zfs_zaccess_rwx(zp, mode, flag, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * If vnode is for a device return a specfs vnode instead.
  *
  * For a device vnode, *vpp is released and replaced with the result of
  * specvp(); ENOSYS is returned (and *vpp is NULL) when specvp() yields no
  * vnode.  Any other vnode is left untouched and 0 is returned.
  */
 static int
 specvp_check(vnode_t **vpp, cred_t *cr)
 {
 	struct vnode *devvp;
 	struct vnode *svp;
 
 	if (!IS_DEVVP(*vpp))
 		return (0);
 
 	devvp = *vpp;
 	svp = specvp(devvp, devvp->v_rdev, devvp->v_type, cr);
 	VN_RELE(devvp);
 	*vpp = svp;
 
 	if (svp == NULL)
 		return (SET_ERROR(ENOSYS));
 	return (0);
 }
 
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- component name; cn_flags is read and may gain
  *			  SAVENAME, cn_lkflags is used to lock the result.
  *		nameiop	- operation the lookup is for (CREATE, RENAME,
  *			  DELETE are treated specially).
  *		cr	- credentials of caller.
  *		td	- calling thread [UNUSED].
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.  For the last
  *		component of a CREATE or RENAME, ENOENT is reported as
  *		EJUSTRETURN.
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td, int flags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
 	/* Always NULL here; passed through to zfs_dirlook(). */
 	int *direntflags = NULL;
 	void *realpnp = NULL;
 
 	/* fast path */
 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 
 		if (dvp->v_type != VDIR) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 
 		/* An empty name or "." resolves to the directory itself. */
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 			error = zfs_fastaccesschk_execute(zdp, cr);
 			if (!error) {
 				*vpp = dvp;
 				VN_HOLD(*vpp);
 				return (0);
 			}
 			return (error);
 		} else {
 			vnode_t *tvp = dnlc_lookup(dvp, nm);
 
 			if (tvp) {
 				error = zfs_fastaccesschk_execute(zdp, cr);
 				if (error) {
 					VN_RELE(tvp);
 					return (error);
 				}
 				/* A cached negative entry means ENOENT. */
 				if (tvp == DNLC_NO_VNODE) {
 					VN_RELE(tvp);
 					return (SET_ERROR(ENOENT));
 				} else {
 					*vpp = tvp;
 					return (specvp_check(vpp, cr));
 				}
 			}
 		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
 
 	*vpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 #ifdef TODO
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 #endif
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
 		    B_FALSE, cr)) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (dvp->v_type != VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENOTDIR));
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* On UTF-8-only file systems reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
 	if (error == 0)
 		error = specvp_check(vpp, cr);
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 	/* Lock the result, except when the lookup was for "." itself. */
 	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
 		int ltype = 0;
 
 		/*
 		 * For ".." the directory is unlocked before the result is
 		 * locked and relocked afterwards with its previous lock type.
 		 */
 		if (cnp->cn_flags & ISDOTDOT) {
 			ltype = VOP_ISLOCKED(dvp);
 			VOP_UNLOCK(dvp, 0);
 		}
 		ZFS_EXIT(zfsvfs);
 		error = vn_lock(*vpp, cnp->cn_lkflags);
 		if (cnp->cn_flags & ISDOTDOT)
 			vn_lock(dvp, ltype | LK_RETRY);
 		if (error != 0) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 			return (error);
 		}
 	} else {
 		ZFS_EXIT(zfsvfs);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, *vpp, cnp);
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the vp of the created or trunc'd file.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		td	- calling thread [UNUSED in this function body].
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Note: "flag" (open flags) and "vsecp" (ACL to be set) are not
  * parameters of this version of the function.  They are locals that are
  * initialized to 0 and NULL and never reassigned, so the FIGNORECASE,
  * FAPPEND and explicit-ACL branches below are never taken.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* B_TRUE while acl_ids holds state that must be freed. */
 	boolean_t	have_acl = B_FALSE;
 	/* Set once we have slept in dmu_tx_wait(); selects TXG_WAITED. */
 	boolean_t	waited = B_FALSE;
 	void		*vsecp = NULL;
 	int		flag = 0;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	/* On UTF-8-only file systems reject names that do not validate. */
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * From here on every exit path calls getnewvnode_drop_reserve()
 	 * exactly once (directly, or through the "out" label).
 	 */
 	getnewvnode_reserve(1);
 
 	/* Retry point after dmu_tx_assign() returned ERESTART. */
 top:
 	*vpp = NULL;
 
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		VN_HOLD(dvp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible VN_HOLD(zp) */
 		int zflg = 0;
 
 		if (flag & FIGNORECASE)
 			zflg |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 		    NULL, NULL);
 		if (error) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			/* Any lock failure on ".." is reported as EISDIR. */
 			if (strcmp(name, "..") == 0)
 				error = SET_ERROR(EISDIR);
 			getnewvnode_drop_reserve();
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	if (zp == NULL) {
 		uint64_t txtype;
 
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 
 		if ((dzp->z_pflags & ZFS_XATTR) &&
 		    (vap->va_type != VREG)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * Build the ACL/id state only once; on a retry via "top"
 		 * have_acl is still B_TRUE and the existing acl_ids is reused.
 		 */
 		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 		    cr, vsecp, &acl_ids)) != 0)
 			goto out;
 		have_acl = B_TRUE;
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 			zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 		if (error) {
 			/* Drop the dirent lock before waiting or failing. */
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
 				/* acl_ids is kept for the retry. */
 				waited = B_TRUE;
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			getnewvnode_drop_reserve();
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
 		/* Link the new znode into the directory and log the create. */
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 		    vsecp, acl_ids.z_fuidp, vap);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
 
 		/* The entry exists, so no ACL state is needed any more. */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		have_acl = B_FALSE;
 
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl == EXCL) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
 			error = SET_ERROR(EISDIR);
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if ((ZTOV(zp)->v_type == VREG) &&
 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
 			/* we can't hold any locks when calling zfs_freesp() */
 			zfs_dirent_unlock(dl);
 			dl = NULL;
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 			if (error == 0) {
 				/*
 				 * NOTE(review): "ct" is not declared anywhere
 				 * in this function; this presumably only
 				 * builds because vnevent_create() is a macro
 				 * that discards its arguments -- confirm.
 				 */
 				vnevent_create(ZTOV(zp), ct);
 			}
 		}
 	}
 out:
 	/* Common exit: release the vnode reservation and the dirent lock. */
 	getnewvnode_drop_reserve();
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		/* Drop the hold taken on zp (if any) on failure. */
 		if (zp)
 			VN_RELE(ZTOV(zp));
 	} else {
 		*vpp = ZTOV(zp);
 		error = specvp_check(vpp, cr);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 
 /*
  * Zero object id written through sa_update() in zfs_remove() to clear the
  * SA_ZPL_XATTR attribute of a znode that is not in SA format.
  */
 uint64_t null_xattr = 0;
 
 /*ARGSUSED*/
 static int
 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
     int flags)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	znode_t		*xzp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
 	int		error;
 	int		zflg = ZEXISTS;
 	/* Set once we have slept in dmu_tx_wait(); selects TXG_WAITED. */
 	boolean_t	waited = B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * For case-insensitive removal, also ask zfs_dirent_lock() for the
 	 * real (on-disk) name; realnm must be pn_free()d on every exit.
 	 */
 	if (flags & FIGNORECASE) {
 		zflg |= ZCILOOK;
 		pn_alloc(&realnm);
 		realnmp = &realnm;
 	}
 
 	/* Retry point after dmu_tx_assign() returned ERESTART. */
 top:
 	xattr_obj = 0;
 	xzp = NULL;
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, realnmp)) {
 		if (realnmp)
 			pn_free(realnmp);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	vnevent_remove(vp, dvp, name, ct);
 
 	if (realnmp)
 		dnlc_remove(dvp, realnmp->pn_buf);
 	else
 		dnlc_remove(dvp, name);
 
 	/*
 	 * First, unlocked-against-the-tx guess at whether the znode could be
 	 * destroyed immediately; it is re-evaluated below once the link has
 	 * actually been removed.
 	 */
 	VI_LOCK(vp);
 	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
 	VI_UNLOCK(vp);
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	if (may_delete_now) {
 		toobig =
 		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
 		/* if the file is too big, only hold_free a token amount */
 		dmu_tx_hold_free(tx, zp->z_id, 0,
 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		/*
 		 * NOTE(review): a zfs_zget() failure is only checked by
 		 * ASSERT0(); xzp is dereferenced two lines below.  If
 		 * assertions are compiled out, a failure here would
 		 * presumably leave xzp NULL -- confirm this cannot happen.
 		 */
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	mutex_enter(&zp->z_lock);
 	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 	mutex_exit(&zp->z_lock);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/* Undo everything acquired since "top" before retry/fail. */
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (xzp)
 			VN_RELE(ZTOV(xzp));
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		if (realnmp)
 			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		/*
 		 * Hold z_lock so that we can make sure that the ACL obj
 		 * hasn't changed.  Could have been deleted due to
 		 * zfs_sa_upgrade().
 		 *
 		 * z_lock stays held past this block; it is released in
 		 * whichever of the two branches below runs (delete_now, or
 		 * the "else if (unlinked)" branch).
 		 */
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 		/*
 		 * Immediate delete only if nothing changed since the holds
 		 * were taken: same vnode count / cache state, same xattr
 		 * object and same external ACL object.
 		 */
 		delete_now = may_delete_now && !toobig &&
 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
 		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
 		    acl_obj;
 		VI_UNLOCK(vp);
 	}
 
 	if (delete_now) {
 #ifdef __FreeBSD__
 		/* This branch is treated as unreachable on FreeBSD. */
 		panic("zfs_remove: delete_now branch taken");
 #endif
 		if (xattr_obj_unlinked) {
 			/* Unlink the xattr directory along with the file. */
 			ASSERT3U(xzp->z_links, ==, 2);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = 1;
 			xzp->z_links = 0;
 			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 			    &xzp->z_links, sizeof (xzp->z_links), tx);
 			ASSERT3U(error,  ==,  0);
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 
 			if (zp->z_is_sa)
 				error = sa_remove(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), tx);
 			else
 				error = sa_update(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
 				    sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		/*
 		 * Drop our vnode reference by hand; this is why the common
 		 * exit path skips VN_RELE(vp) when delete_now is set.
 		 */
 		VI_LOCK(vp);
 		vp->v_count--;
 		ASSERT0(vp->v_count);
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		zfs_znode_delete(zp, tx);
 	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 #ifdef __FreeBSD__
 		vp->v_vflag |= VV_NOSYNC;
 #endif
 	}
 
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
 	if (realnmp)
 		pn_free(realnmp);
 
 	zfs_dirent_unlock(dl);
 
 	if (!delete_now)
 		VN_RELE(vp);
 	if (xzp)
 		VN_RELE(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dvp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dvp	- vnode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created directory.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 /*ARGSUSED*/
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
     caller_context_t *ct, int flags, vsecattr_t *vsecp)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	/* ZNEW: the dirent lock must fail if the name already exists. */
 	int		zf = ZNEW;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* Set once we have slept in dmu_tx_wait(); selects TXG_WAITED. */
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(vap->va_type == VDIR);
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* Subdirectories are not allowed inside an xattr directory. */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* On UTF-8-only file systems reject names that do not validate. */
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * acl_ids is built once, before the retry label, and reused across
 	 * ERESTART retries; every exit path below frees it.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    vsecp, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Matched by getnewvnode_drop_reserve() on every exit below. */
 	getnewvnode_reserve(1);
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 top:
 	*vpp = NULL;
 
 	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
 	    NULL, NULL)) {
 		zfs_acl_ids_free(&acl_ids);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/* Drop the dirent lock before waiting or failing. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
 	/* Record the mkdir in the intent log before committing the tx. */
 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	zfs_dirent_unlock(dl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- vnode of current working directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *		ENOTDIR if the entry is not a directory, EINVAL if it
  *		is the current working directory.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zflg = ZEXISTS;
 	/* Set once we have slept in dmu_tx_wait(); selects TXG_WAITED. */
 	boolean_t	waited = B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 	/* Retry point after dmu_tx_assign() returned ERESTART. */
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, NULL)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	if (vp->v_type != VDIR) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	if (vp == cwd) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	/*
 	 * Grab a lock on the directory to make sure that noone is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/*
 		 * Release the locks in reverse order of acquisition and
 		 * drop the vnode hold before retrying or failing.
 		 */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(dvp);
 #endif
 
 	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
 
 	/* Only log the rmdir if the link was actually destroyed. */
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	/* The tx is committed whether or not zfs_link_destroy() failed. */
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 out:
 	zfs_dirent_unlock(dl);
 
 	VN_RELE(vp);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected; may be NULL,
  *			  in which case a local variable is used instead.
  *		ncookies - if non-NULL, receives the number of cookies
  *			  stored in *cookies.
  *		cookies	- if ncookies is non-NULL, receives a malloc'ed
  *			  (M_TEMP) array holding, for each entry processed,
  *			  the offset of the entry that follows it.  On error
  *			  the array is freed here and *cookies set to NULL.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  *
  * Note: "flags" is a local fixed at 0 in this version, so the
  * V_RDDIR_ACCFILTER and V_RDDIR_ENTFLAGS branches below are never taken.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	edirent_t	*eodp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	boolean_t	check_sysattrs;
 	uint8_t		type;
 	int		ncooks;
 	u_long		*cooks = NULL;
 	int		flags = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* The parent object number is what ".." reports. */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 *
 	 * A single kernel-space iovec is filled in place; anything else goes
 	 * through a bounce buffer that is copied out with uiomove() below.
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 	/*
 	 * If this VFS supports the system attribute view interface; and
 	 * we're looking at an extended attribute directory; and we care
 	 * about normalization conflicts on this vfs; then we must check
 	 * for normalization conflicts with the sysattr name space.
 	 */
 #ifdef TODO
 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
 	    (flags & V_RDDIR_ENTFLAGS);
 #else
 	check_sysattrs = 0;
 #endif
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				/* ENOENT means end of directory, not failure. */
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 
 			if (check_sysattrs && !zap.za_normalization_conflict) {
 #ifdef TODO
 				zap.za_normalization_conflict =
 				    xattr_sysattr_casechk(zap.za_name);
 #else
 				panic("%s:%u: TODO", __func__, __LINE__);
 #endif
 			}
 		}
 
 		if (flags & V_RDDIR_ACCFILTER) {
 			/*
 			 * If we have no access at all, don't include
 			 * this entry in the returned information
 			 */
 			znode_t	*ezp;
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
 				VN_RELE(ZTOV(ezp));
 				goto skip_entry;
 			}
 			VN_RELE(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
 		else
 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = SET_ERROR(EINVAL);
 				goto update;
 			}
 			break;
 		}
 		if (flags & V_RDDIR_ENTFLAGS) {
 			/*
 			 * Add extended flag entry:
 			 */
 			eodp->ed_ino = objnum;
 			eodp->ed_reclen = reclen;
 			/* NOTE: ed_off is the offset for the *next* entry */
 			next = &(eodp->ed_off);
 			eodp->ed_eflags = zap.za_normalization_conflict ?
 			    ED_CASE_CONFLICT : 0;
 			(void) strncpy(eodp->ed_name, zap.za_name,
 			    EDIRENT_NAMELEN(reclen));
 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
 		} else {
 			/*
 			 * Add normal entry:
 			 */
 			odp->d_ino = objnum;
 			odp->d_reclen = reclen;
 			odp->d_namlen = strlen(zap.za_name);
 			/*
 			 * Copy the name and NUL-fill everything up to the end
 			 * of the record.  reclen is rounded up for alignment,
 			 * and the bounce buffer comes from kmem_alloc(), so
 			 * bytes left unwritten after the terminator would be
 			 * handed to the caller as stale memory.  strncpy()
 			 * pads with NULs up to the given size.  This relies
 			 * on DIRENT64_RECLEN() reserving room for the name
 			 * plus its terminator, as the record layout already
 			 * requires.
 			 */
 			(void) strncpy(odp->d_name, zap.za_name,
 			    reclen - (size_t)((caddr_t)odp->d_name -
 			    (caddr_t)odp));
 			odp->d_type = type;
 			odp = (dirent64_t *)((intptr_t)odp + reclen);
 		}
 		outcount += reclen;
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0, 0,
 			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		/* Each cookie is the offset of the entry after this one. */
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		/* Entries were written in place; just advance the uio. */
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = uio->uio_loffset;
 	}
 
 update:
 	zap_cursor_fini(&zc);
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	/* End of directory (ENOENT from the cursor) is not an error. */
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /*
  * Value that zfs_fsync() stores in the calling thread's zfs_fsyncer_key
  * thread-specific slot.
  */
 ulong_t zfs_fsync_sync_cnt = 4;
 
 /*
  * Flush the intent log records belonging to one file.
  *
  *	IN:	vp	- vnode of file to sync.
  *		syncflag - not referenced by this function.
  *		cr	- not referenced by this function.
  *		ct	- not referenced by this function.
  *
  *	RETURN:	0 from every visible path; ZFS_ENTER/ZFS_VERIFY_ZP are
  *		macros defined elsewhere and may presumably return early.
  */
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs;
 
 	zp = VTOZ(vp);
 	zfsvfs = zp->z_zfsvfs;
 
 	/* Tag this thread as an fsync caller before anything else. */
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
 	/* With sync disabled on the objset there is nothing to commit. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If AT_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 on success, error code on failure.
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	uint64_t links;
 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	/*
 	 * Fetch the timestamps (and the device number for device nodes)
 	 * in one bulk lookup.  At most four entries are queued, which is
 	 * what bulk[] is sized for.  rdev is only written, and only read
 	 * later, when the vnode is VBLK or VCHR.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
 		    &rdev, 8);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	mutex_enter(&zp->z_lock);
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
 #ifdef illumos
 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
 #else
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 #endif
 	vap->va_nodeid = zp->z_id;
 	/* The root directory reports one extra link when the ctldir is shown. */
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
 		links = zp->z_links + 1;
 	else
 		links = zp->z_links;
 	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
 	vap->va_size = zp->z_size;
 #ifdef illumos
 	vap->va_rdev = vp->v_rdev;
 #else
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 #endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    vp->v_type == VREG) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 			/*
 			 * The creation time was already fetched (and the
 			 * lookup checked for errors) by the bulk lookup
 			 * above, so reuse it rather than issuing a second,
 			 * unchecked sa_lookup() into an uninitialized
 			 * buffer.
 			 */
 			ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
 			XVA_SET_RTN(xvap, XAT_CREATETIME);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = zp->z_gen;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 	}
 
 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
 	mutex_exit(&zp->z_lock);
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values.
  *			  If AT_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
 	int		count = 0, xattr_count = 0;
 
 	/* Nothing requested: succeed without entering the filesystem. */
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Size changes are rejected for directories (EISDIR) ... */
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EISDIR));
 	}
 
 	/* ... and for anything else that is not a regular file or FIFO. */
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	/*
 	 * tmpxvattr remembers which optional-attribute request bits get
 	 * cleared from xvap further down, so they can be restored before
 	 * zfs_xvattr_set() is called.
 	 */
 	xva_init(&tmpxvattr);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (AT_ATIME | AT_MTIME)) {
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EOVERFLOW));
 		}
 	}
 
 	/*
 	 * Retry point: the error path at "out" jumps back here when the
 	 * operation failed with ERESTART.  attrzp and aclp are reset so the
 	 * cleanup code never sees stale pointers from a previous attempt.
 	 */
 top:
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	/*
 	 * The zfs_zaccess() result is stored directly in need_policy: any
 	 * non-zero value (access not granted) is treated as TRUE by the
 	 * "if (need_policy)" test below, deferring the decision to
 	 * secpolicy_vnode_setattr().
 	 */
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	/*
 	 * Snapshot the current mode and owner into oldva (passed to the
 	 * secpolicy_* checks below) and filter the xvattr request while
 	 * holding z_lock.
 	 */
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		/*
 		 * Setting the quarantine bit on anything other than a
 		 * regular file always goes through the policy check.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		/* The reparse attribute can never be set through this path. */
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EPERM));
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Save the mode, as secpolicy_vnode_setattr()
 				 * will overwrite it with ova.va_mode.
 				 */
 				saved_mode = vap->va_mode;
 			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		if (trim_mask) {
 			vap->va_mask |= saved_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Recover the mode after
 				 * secpolicy_vnode_setattr().
 				 */
 				vap->va_mode = saved_mode;
 			}
 		}
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	/*
 	 * For ownership changes, take a hold on the extended attribute
 	 * directory znode (if the file has one) so it can be updated in
 	 * step, and reject the change with EDQUOT if the new owner or group
 	 * is over quota.
 	 *
 	 * NOTE(review): the "goto out2" exits in this section bypass the
 	 * fuidp cleanup that is done after the "out" label.  If
 	 * zfs_fuid_create() can allocate *fuidp, those paths would leak it --
 	 * confirm against zfs_fuid_create().
 	 */
 	if ((mask & (AT_UID | AT_GID))) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
 			if (err)
 				goto out2;
 		}
 		if (mask & AT_UID) {
 			new_uid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_uid != zp->z_uid &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
 				if (attrzp)
 					VN_RELE(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & AT_GID) {
 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &fuidp);
 			if (new_gid != zp->z_gid &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
 				if (attrzp)
 					VN_RELE(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 	}
 	/*
 	 * From here on a transaction exists; every failure must leave via
 	 * "out" so that the transaction is aborted.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		/* Keep the existing file-type bits; take the rest from vap. */
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = SET_ERROR(EPERM);
 			goto out;
 		}
 
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
 
 		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & AT_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
 	mutex_enter(&zp->z_lock);
 
 	/*
 	 * The bulk arrays are given the addresses of the values to write.
 	 * The actual writes happen in the sa_bulk_update() calls after the
 	 * "out" label, so the pointed-to values may still be changed until
 	 * then.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 
 		if (mask & AT_UID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			zp->z_uid = new_uid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				attrzp->z_uid = new_uid;
 			}
 		}
 
 		if (mask & AT_GID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			zp->z_gid = new_gid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				attrzp->z_gid = new_gid;
 			}
 		}
 		/*
 		 * When the mode is not otherwise being changed, the current
 		 * mode is still queued for writing.  new_mode is assigned
 		 * right after its address is queued, which is fine because
 		 * the bulk update itself runs later.
 		 */
 		if (!(mask & AT_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & AT_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = new_mode;
 		ASSERT3U((uintptr_t)aclp, !=, 0);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		/*
 		 * The new ACL becomes the znode's cached ACL; aclp is cleared
 		 * so the cleanup after "out" does not free it.
 		 */
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 
 	if (mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
 	if (mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 		    NULL, mtime, sizeof (mtime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 	} else if (mask != 0) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
 		    B_TRUE);
 		if (attrzp) {
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_CTIME(zfsvfs), NULL,
 			    &ctime, sizeof (ctime));
 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
 			    mtime, ctime, B_TRUE);
 		}
 	}
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
 
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(vp->v_type == VREG);
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	mutex_exit(&zp->z_lock);
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 		mutex_exit(&attrzp->z_lock);
 	}
 	/*
 	 * Common exit once a transaction exists.  Reached with err == 0 on
 	 * success, or with err != 0 from the failures above (none of which
 	 * hold the znode locks when they jump here).
 	 */
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 		if (err == ERESTART)
 			goto top;
 	} else {
 		/*
 		 * NOTE(review): the result of this update is stored in err2
 		 * but never examined, so a failure here is not reported to
 		 * the caller.
 		 */
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Exit used by failures that occur before the transaction is
 	 * created; it is also the fall-through for the normal path.
 	 */
 out2:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * One entry in the singly linked list built by zfs_rename_lock() and torn
  * down by zfs_rename_unlock().  Each entry records one parent lock that was
  * taken and, optionally, the znode whose hold must be released with it.
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Drop locks and release vnodes that were held by zfs_rename_lock().
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t *entry, *rest;
 
 	/*
 	 * Pop entries off the head of the list one at a time: release the
 	 * znode hold (if any), drop the lock, advance the caller's list
 	 * head, then free the entry.  On return *zlpp is NULL.
 	 */
 	for (entry = *zlpp; entry != NULL; entry = rest) {
 		if (entry->zl_znode != NULL)
 			VN_RELE(ZTOV(entry->zl_znode));
 		rw_exit(entry->zl_rwlock);
 		rest = entry->zl_next;
 		*zlpp = rest;
 		kmem_free(entry, sizeof (*entry));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  */
 /*
  * szp  - znode being renamed; its z_parent_lock is taken as writer first.
  * tdzp - target directory; the upward walk starts here.
  * sdzp - source directory; the walk stops once it has processed this znode.
  * zlpp - in/out list head.  Every lock taken is pushed onto *zlpp; the
  *        caller is expected to release them with zfs_rename_unlock(),
  *        including when this function returns an error, because the error
  *        returns below leave the list populated.
  *
  * Returns 0 on success, EINVAL if the target directory turns out to be
  * szp itself or one of its descendants, or the error from zfs_zget().
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = zp->z_zfsvfs->z_root;
 	uint64_t	oidp = zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 */
 				/*
 				 * rw == RW_READER means at least one pass has
 				 * completed, so zl is the most recently pushed
 				 * entry, i.e. the current list head.  Unlocking
 				 * through &zl therefore frees the whole list;
 				 * *zlpp is then cleared by hand.
 				 *
 				 * "continue" in a do/while jumps to the loop
 				 * condition, which is evaluated with zp already
 				 * reset to tdzp.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (oidp == szp->z_id)		/* We're a descendant of szp */
 			return (SET_ERROR(EINVAL));
 
 		if (oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
 			if (error)
 				return (error);
 			/* The hold from zfs_zget() is dropped at unlock time. */
 			zl->zl_znode = zp;
 		}
 		/* Step up: oidp becomes the object id of zp's parent. */
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
 		    &oidp, sizeof (oidp));
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*tdzp, *sdzp, *szp, *tzp;
 	zfsvfs_t 	*zfsvfs;
 	zilog_t		*zilog;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr;
 	int		error = 0;
 	int		zflg = 0;
 	boolean_t	waited = B_FALSE;
 
 	/*
 	 * Enter the file system before validating tdzp.  ZFS_VERIFY_ZP() is
 	 * expected to bail out through ZFS_EXIT(zfsvfs) on an invalid znode
 	 * (the explicit sdzp check below has the same shape), so zfsvfs must
 	 * already be assigned and entered when it runs.  This is the same
 	 * ENTER-then-VERIFY order used by zfs_getattr() and zfs_setattr().
 	 */
 	tdzp = VTOZ(tdvp);
 	zfsvfs = tdzp->z_zfsvfs;
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(tdzp);
 	zilog = zfsvfs->z_log;
 	sdzp = VTOZ(sdvp);
 
 	/*
 	 * In case sdzp is not valid, let's be sure to exit from the right
 	 * zfsvfs_t.
 	 */
 	if (sdzp->z_sa_hdl == NULL) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
 	 * ctldir appear to have the same v_vfsp.
 	 */
 	if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EXDEV));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 	/*
 	 * Retry point: reached again after dmu_tx_assign() returns ERESTART,
 	 * at which point every lock and hold taken below has been dropped.
 	 */
 top:
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		/*
 		 * First compare the two name arguments without
 		 * considering any case folding.
 		 */
 		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 
 		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 		ASSERT(error == 0 || !zfsvfs->z_utf8);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 		/*
 		 * If the file system is case-folding, then we may
 		 * have some more checking to do.  A case-folding file
 		 * system is either supporting mixed case sensitivity
 		 * access or is completely case-insensitive.  Note
 		 * that the file system is always case preserving.
 		 *
 		 * In mixed sensitivity mode case sensitive behavior
 		 * is the default.  FIGNORECASE must be used to
 		 * explicitly request case insensitive behavior.
 		 *
 		 * If the source and target names provided differ only
 		 * by case (e.g., a request to rename 'tim' to 'Tim'),
 		 * we will treat this as a special case in the
 		 * case-insensitive mode: as long as the source name
 		 * is an exact match, we will allow this to proceed as
 		 * a name-change request.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 		    flags & FIGNORECASE)) &&
 		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 		    &error) == 0) {
 			/*
 			 * case preserving rename request, require exact
 			 * name matches
 			 */
 			zflg |= ZCIEXACT;
 			zflg &= ~ZCILOOK;
 		}
 	}
 
 	/*
 	 * If the source and destination directories are the same, we should
 	 * grab the z_name_lock of that directory only once.
 	 */
 	if (sdzp == tdzp) {
 		zflg |= ZHAVELOCK;
 		rw_enter(&sdzp->z_name_lock, RW_READER);
 	}
 
 	/* Take the two directory-entry locks in the order chosen above. */
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 		    ZEXISTS | zflg, NULL, NULL);
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 	} else {
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, zflg, NULL, NULL);
 		serr = zfs_dirent_lock(&sdl,
 		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 		    NULL, NULL);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				VN_RELE(ZTOV(tzp));
 		}
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		/*
 		 * FreeBSD: In OpenSolaris they only check if rename source is
 		 * ".." here, because "." is handled in their lookup. This is
 		 * not the case for FreeBSD, so we check for "." explicitly.
 		 */
 		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
 			serr = SET_ERROR(EINVAL);
 		ZFS_EXIT(zfsvfs);
 		return (serr);
 	}
 	if (terr) {
 		zfs_dirent_unlock(sdl);
 		VN_RELE(ZTOV(szp));
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(tnm, "..") == 0)
 			terr = SET_ERROR(EINVAL);
 		ZFS_EXIT(zfsvfs);
 		return (terr);
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) != 0)
 		goto out;
 
 	if (ZTOV(szp)->v_type == VDIR) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) != 0)
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if (ZTOV(szp)->v_type == VDIR) {
 			if (ZTOV(tzp)->v_type != VDIR) {
 				error = SET_ERROR(ENOTDIR);
 				goto out;
 			}
 		} else {
 			if (ZTOV(tzp)->v_type == VDIR) {
 				error = SET_ERROR(EISDIR);
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	}
 
 	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
 	if (tzp)
 		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
 	 * as source directory.
 	 */
 	if (tdvp != sdvp) {
 		vnevent_rename_dest_dir(tdvp, ct);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/*
 		 * Drop every lock and hold before waiting or failing, so a
 		 * retry from "top" starts from a clean slate.
 		 */
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME |
 				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
 				    sdl->dl_name, tdzp, tdl->dl_name, szp);
 
 				/*
 				 * Update path information for the target vnode
 				 */
 				vn_renamepath(tdvp, ZTOV(szp), tnm,
 				    strlen(tnm));
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
 #ifdef FREEBSD_NAMECACHE
 		if (error == 0) {
 			cache_purge(sdvp);
 			cache_purge(tdvp);
 			cache_purge(ZTOV(szp));
 			if (tzp)
 				cache_purge(ZTOV(tzp));
 		}
 #endif
 	}
 
 	dmu_tx_commit(tx);
 	/*
 	 * Common cleanup.  Also reached directly by the validation failures
 	 * above, before any transaction has been created.
 	 */
 out:
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	if (sdzp == tdzp)
 		rw_exit(&sdzp->z_name_lock);
 
 
 	VN_RELE(ZTOV(szp));
 	if (tzp)
 		VN_RELE(ZTOV(tzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		name	- Name for new symlink entry.
  *		vap	- Attributes of new entry (va_type must be VLNK).
  *		link	- Contents to store in the new symlink.
  *		cr	- credentials of caller.
  *		td	- calling thread [UNUSED].
  *
  *	OUT:	vpp	- vnode of the newly created symlink.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
     cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 	boolean_t	waited = B_FALSE;
 	/*
 	 * This signature carries no case flags, so `flags' stays 0 and the
 	 * FIGNORECASE branches below are never taken.
 	 */
 	int		flags = 0;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* On a UTF-8-only file system, reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 	/* The link contents may not be longer than MAXPATHLEN. */
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	/*
 	 * acl_ids is created once here and kept across the `top' retry
 	 * loop; every later exit path frees it with zfs_acl_ids_free().
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Every return after this point is paired with a call to
 	 * getnewvnode_drop_reserve().
 	 */
 	getnewvnode_reserve(1);
 
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* The caller must be allowed to add a file to the directory. */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 	/*
 	 * Declare everything the transaction may touch: the new object's
 	 * data, the directory ZAP entry, the new object's SA area, the
 	 * directory's SA, an optional external ACL and the FUID table.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/*
 		 * The directory entry lock is dropped before waiting; on
 		 * ERESTART the whole sequence restarts from `top' with
 		 * `waited' set so the next assign uses TXG_WAITED.
 		 */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Store the link contents under z_lock.
 	 * NOTE(review): only the sa_update() in the SA branch assigns
 	 * `error'; in the other branch it keeps the 0 left by
 	 * dmu_tx_assign().  The results of the size update and of
 	 * zfs_link_create() below are discarded, so the value returned at
 	 * the end does not reflect them - confirm this is intended.
 	 */
 	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 	mutex_exit(&zp->z_lock);
 
 	/* The symlink's size is the length of its contents. */
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	/* Hand the new vnode back to the caller. */
 	*vpp = ZTOV(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	zfs_dirent_unlock(dl);
 
 	/* With sync=always, push the intent log before returning. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Copy the contents of the symbolic link vp into the caller's uio.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller [UNUSED].
  *		ct	- caller context [UNUSED].
  *
  *	OUT:	uio	- structure containing the link path.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * The link contents are fetched with z_lock held, from the
 	 * SA symlink attribute when the znode uses SAs and through
 	 * zfs_sa_readlink() otherwise.
 	 */
 	mutex_enter(&zp->z_lock);
 	error = zp->z_is_sa ?
 	    sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio) :
 	    zfs_sa_readlink(zp, uio);
 	mutex_exit(&zp->z_lock);
 
 	/* The access time is stamped whether or not the read succeeded. */
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Insert a new entry into directory tdvp referencing svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags (FIGNORECASE selects a
  *			  case-insensitive directory lookup)
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
 	int		error;
 	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* If svp has an underlying "real" vnode, operate on that instead. */
 	if (VOP_REALVP(svp, &realvp, ct) == 0)
 		svp = realvp;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	szp = VTOZ(svp);
 	ZFS_VERIFY_ZP(szp);
 
 	/* Append-only, immutable and read-only files may not gain links. */
 	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
 	 * ctldir appear to have the same v_vfsp.
 	 */
 	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* On a UTF-8-only file system, reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 
 	/*
 	 * A caller who is not the file's owner must additionally pass
 	 * secpolicy_basic_link().
 	 */
 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* The caller must be allowed to add a file to the directory. */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): zfs_sa_upgrade_txholds() is called for dzp although
 	 * only szp's SA handle is held via dmu_tx_hold_sa() here - confirm
 	 * that the directory does not need its own SA hold.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		/*
 		 * The directory entry lock is dropped before waiting; on
 		 * ERESTART the lock/assign sequence restarts from `top'
 		 * with `waited' set so the next assign uses TXG_WAITED.
 		 */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	/* Only a successful link is recorded in the intent log. */
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 	}
 
 	/* The transaction is committed whether or not the link was made. */
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (error == 0) {
 		vnevent_link(svp, ct);
 	}
 
 	/* With sync=always, push the intent log before returning. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 #ifdef illumos
 /*
  * Page-push routine used when the file system has been force
  * unmounted.  Nothing is written: the page is completed as
  * invalidated and in error, which simply drops it.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
     size_t *lenp, int flags, cred_t *cr)
 {
 	const int	discard = B_INVAL | B_FORCE | B_ERROR;
 
 	pvn_write_done(pp, discard);
 	return (0);
 }
 
 /*
  * Push a page out to disk, klustering if possible.
  *
  *	IN:	vp	- file to push page to.
  *		pp	- page to push.
  *		flags	- additional flags.
  *		cr	- credentials of caller.
  *
  *	OUT:	offp	- start of range pushed (written only if non-NULL).
  *		lenp	- len of range pushed (written only if non-NULL).
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * NOTE: callers must have locked the page to be pushed.  On
  * exit, the page (and all other pages in the kluster) must be
  * unlocked.
  */
 /* ARGSUSED */
 static int
 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
     size_t *lenp, int flags, cred_t *cr)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_tx_t	*tx;
 	u_offset_t	off, koff;
 	size_t		len, klen;
 	int		err;
 
 	/* Start with just the one page that was handed in. */
 	off = pp->p_offset;
 	len = PAGESIZE;
 	/*
 	 * If our blocksize is bigger than the page size, try to kluster
 	 * multiple pages so that we write a full block (thus avoiding
 	 * a read-modify-write).
 	 */
 	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
 		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
 		/* A non-power-of-two kluster length starts at offset 0. */
 		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
 		ASSERT(koff <= zp->z_size);
 		/* Do not let the kluster extend beyond the end of file. */
 		if (koff + klen > zp->z_size)
 			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
 		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
 	}
 	/* len must be a whole number of pages. */
 	ASSERT3U(btop(len), ==, btopr(len));
 
 	/*
 	 * Can't push pages past end-of-file.
 	 */
 	if (off >= zp->z_size) {
 		/* ignore all pages */
 		err = 0;
 		goto out;
 	} else if (off + len > zp->z_size) {
 		int npages = btopr(zp->z_size - off);
 		page_t *trunc;
 
 		/* Split off the pages that lie beyond end-of-file. */
 		page_list_break(&pp, &trunc, npages);
 		/* ignore pages past end of file */
 		if (trunc)
 			pvn_write_done(trunc, flags);
 		len = zp->z_size - off;
 	}
 
 	/* Refuse the write if either owner quota check reports overquota. */
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		err = SET_ERROR(EDQUOT);
 		goto out;
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	/*
 	 * With a block no larger than a page, the single page is mapped
 	 * and written directly; otherwise the page list is handed to
 	 * dmu_write_pages().
 	 */
 	if (zp->z_blksz <= PAGESIZE) {
 		caddr_t va = zfs_map_page(pp, S_READ);
 		ASSERT3U(len, <=, PAGESIZE);
 		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
 		zfs_unmap_page(pp, va);
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		/*
 		 * NOTE(review): bulk[] is populated with the mtime, ctime
 		 * and flags attributes, but no sa_bulk_update() call is
 		 * visible in this function - confirm whether the updated
 		 * timestamps are persisted elsewhere.
 		 */
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
 	/* The transaction is committed even if the write failed. */
 	dmu_tx_commit(tx);
 
 out:
 	/* Complete the page list, flagging B_ERROR on failure. */
 	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
 	if (offp)
 		*offp = off;
 	if (lenp)
 		*lenp = len;
 
 	return (err);
 }
 
 /*
  * Copy the portion of the file indicated from pages into the file.
  * The pages are stored in a page list attached to the file's vnode.
  *
  *	IN:	vp	- vnode of file to push page data to.
  *		off	- position in file to put data.
  *		len	- amount of data to write.
  *		flags	- flags to control the operation.
  *		cr	- credentials of caller.
  *		ct	- caller context.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	page_t		*pp;
 	size_t		io_len;
 	u_offset_t	io_off;
 	uint_t		blksz;
 	rl_t		*rl;
 	int		error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * Align this request to the file block size in case we kluster.
 	 * XXX - this can result in pretty aggressive locking, which can
 	 * impact simultaneous read/write access.  One option might be
 	 * to break up long requests (len == 0) into block-by-block
 	 * operations to get narrower locking.
 	 */
 	blksz = zp->z_blksz;
 	if (ISP2(blksz))
 		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
 	else
 		io_off = 0;
 	/*
 	 * io_len stays 0 when len is 0 or the block size is not a power
 	 * of two; both cases take the whole-list path below.
 	 */
 	if (len > 0 && ISP2(blksz))
 		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
 	else
 		io_len = 0;
 
 	if (io_len == 0) {
 		/*
 		 * Search the entire vp list for pages >= io_off.
 		 */
 		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
 		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
 		goto out;
 	}
 	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
 
 	if (off > zp->z_size) {
 		/* past end of file */
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/* Clamp the range to the page-rounded end of file. */
 	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
 
 	/*
 	 * Walk the range.  `off' is reused as the fixed start of the
 	 * range while io_off is the cursor; zfs_putapage() rewrites
 	 * io_off and io_len through the pointers it is given, so the
 	 * loop step is whatever range it reports having pushed.
 	 */
 	for (off = io_off; io_off < off + len; io_off += io_len) {
 		/*
 		 * Use the blocking page_lookup() for invalidating or
 		 * non-async requests and page_lookup_nowait() otherwise.
 		 */
 		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
 			pp = page_lookup(vp, io_off,
 			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
 		} else {
 			pp = page_lookup_nowait(vp, io_off,
 			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
 		}
 
 		if (pp != NULL && pvn_getdirty(pp, flags)) {
 			int err;
 
 			/*
 			 * Found a dirty page to push
 			 */
 			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
 			/* Remember the failure but keep pushing. */
 			if (err)
 				error = err;
 		} else {
 			/* Nothing to push here; step over one page. */
 			io_len = PAGESIZE;
 		}
 	}
 out:
 	zfs_range_unlock(rl);
 	/* Commit the intent log for non-async requests or sync=always. */
 	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 #endif	/* illumos */
 
 /*
  * Called when vp becomes inactive.  A znode with no SA handle, or one
  * that has been unlinked, is recycled immediately; otherwise a dirty
  * access time is written out in its own transaction.
  */
 /*ARGSUSED*/
 void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	boolean_t	recycle;
 	dmu_tx_t	*tx;
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 
 	if (zp->z_sa_hdl == NULL) {
 		/*
 		 * The fs has been unmounted, or we did a
 		 * suspend/resume and this file no longer exists.
 		 */
 		recycle = B_TRUE;
 	} else {
 		/*
 		 * Fast path to recycle a vnode of a removed file;
 		 * z_unlinked is sampled under z_lock.
 		 */
 		mutex_enter(&zp->z_lock);
 		recycle = zp->z_unlinked ? B_TRUE : B_FALSE;
 		mutex_exit(&zp->z_lock);
 	}
 
 	if (recycle) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 
 		if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 			/* Could not get a transaction; atime stays dirty. */
 			dmu_tx_abort(tx);
 		} else {
 			mutex_enter(&zp->z_lock);
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
 			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 #ifdef illumos
 /*
  * Validate the target of a seek.
  *
  *	IN:	vp	- vnode seeking within
  *		ooff	- old file offset [UNUSED]
  *		noffp	- pointer to new file offset
  *		ct	- caller context [UNUSED]
  *
  *	RETURN:	0 on success, EINVAL if new offset invalid.
  */
 /* ARGSUSED */
 static int
 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
     caller_context_t *ct)
 {
 	offset_t	target;
 
 	/* Offsets into a directory are accepted without inspection. */
 	if (vp->v_type == VDIR)
 		return (0);
 
 	target = *noffp;
 	if (target < 0 || target > MAXOFFSET_T)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Front end to fs_frlock() that refuses a lock request with EAGAIN
  * when the file uses mandatory locking and is currently memory mapped.
  * All other requests are passed through unchanged.
  */
 static int
 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
     flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * We are following the UFS semantics with respect to mapcnt
 	 * here: If we see that the file is mapped already, then we will
 	 * return an error, but we don't worry about races between this
 	 * function and zfs_map().
 	 */
 	if (!(zp->z_mapcnt > 0 && MANDMODE(zp->z_mode))) {
 		/* Not a mapped mandatory-lock file: use the generic code. */
 		ZFS_EXIT(zfsvfs);
 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr,
 		    ct));
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (SET_ERROR(EAGAIN));
 }
 
 /*
  * If we can't find a page in the cache, we will create a new page
  * and fill it with file data.  For efficiency, we may try to fill
  * multiple pages at once (klustering) to fill up the supplied page
  * list.  Note that the pages to be filled are held with an exclusive
  * lock to prevent access by other threads while they are being filled.
  *
  *	IN:	vp	- vnode of file to read from.
  *		off	- file offset of the wanted page.
  *		seg	- segment passed through to page creation.
  *		addr	- address passed through to page creation.
  *		plsz	- size of the page list; PAGESIZE disables
  *			  klustering.
  *		rw	- access mode passed to pvn_plist_init().
  *
  *	OUT:	pl	- page list; *pl is set to NULL when no page was
  *			  created.
  *
  *	RETURN:	0 on success, error code on failure (ECKSUM is
  *		reported as EIO).
  */
 static int
 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
 {
 	znode_t *zp = VTOZ(vp);
 	page_t *pp, *cur_pp;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	u_offset_t io_off, total;
 	size_t io_len;
 	int err;
 
 	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
 		/*
 		 * We only have a single page, don't bother klustering
 		 */
 		io_off = off;
 		io_len = PAGESIZE;
 		pp = page_create_va(vp, io_off, io_len,
 		    PG_EXCL | PG_WAIT, seg, addr);
 	} else {
 		/*
 		 * Try to find enough pages to fill the page list
 		 */
 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
 		    &io_len, off, plsz, 0);
 	}
 	if (pp == NULL) {
 		/*
 		 * The page already exists, nothing to do here.
 		 */
 		*pl = NULL;
 		return (0);
 	}
 
 	/*
 	 * Fill the pages in the kluster.
 	 */
 	cur_pp = pp;
 	/* Read one page per iteration, following the p_next chain. */
 	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
 		caddr_t va;
 
 		ASSERT3U(io_off, ==, cur_pp->p_offset);
 		va = zfs_map_page(cur_pp, S_WRITE);
 		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
 		    DMU_READ_PREFETCH);
 		zfs_unmap_page(cur_pp, va);
 		if (err) {
 			/* On error, toss the entire kluster */
 			pvn_read_done(pp, B_ERROR);
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 		cur_pp = cur_pp->p_next;
 	}
 
 	/*
 	 * Fill in the page list array from the kluster starting
 	 * from the desired offset `off'.
 	 * NOTE: the page list will always be null terminated.
 	 */
 	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
 	ASSERT(pl == NULL || (*pl)->p_offset == off);
 
 	return (0);
 }
 
 /*
  * Return pointers to the pages for the file region [off, off + len]
  * in the pl array.  If plsz is greater than len, this function may
  * also return page pointers from after the specified region
  * (i.e. the region [off, off + plsz]).  These additional pages are
  * only returned if they are already in the cache, or were created as
  * part of a klustered read.
  *
  *	IN:	vp	- vnode of file to get data from.
  *		off	- position in file to get data from.
  *		len	- amount of data to retrieve.
  *		plsz	- length of provided page list.
  *		seg	- segment to obtain pages for.
  *		addr	- virtual address of fault.
  *		rw	- mode of created pages.
  *		cr	- credentials of caller.
  *		ct	- caller context.
  *
  *	OUT:	protp	- protection mode of created pages (set to
  *			  PROT_ALL when non-NULL).
  *		pl	- list of pages created; a NULL pl makes the
  *			  call a no-op that returns 0.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
     enum seg_rw rw, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	/* Remember the start of the list for the error unwind below. */
 	page_t		**pl0 = pl;
 	int		err = 0;
 
 	/* we do our own caching, faultahead is unnecessary */
 	if (pl == NULL)
 		return (0);
 	else if (len > plsz)
 		len = plsz;
 	else
 		len = P2ROUNDUP(len, PAGESIZE);
 	ASSERT(plsz >= len);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (protp)
 		*protp = PROT_ALL;
 
 	/*
 	 * Loop through the requested range [off, off + len) looking
 	 * for pages.  If we don't find a page, we will need to create
 	 * a new page and fill it with data from the file.
 	 */
 	while (len > 0) {
 		/*
 		 * A cached page becomes a one-entry, NULL-terminated
 		 * list; otherwise zfs_fillpage() populates pl.
 		 */
 		if (*pl = page_lookup(vp, off, SE_SHARED))
 			*(pl+1) = NULL;
 		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
 			goto out;
 		/*
 		 * Step over every page now in the list, advancing the
 		 * offset and address and shrinking the remaining length
 		 * and list space by one page each.
 		 */
 		while (*pl) {
 			ASSERT3U((*pl)->p_offset, ==, off);
 			off += PAGESIZE;
 			addr += PAGESIZE;
 			if (len > 0) {
 				ASSERT3U(len, >=, PAGESIZE);
 				len -= PAGESIZE;
 			}
 			ASSERT3U(plsz, >=, PAGESIZE);
 			plsz -= PAGESIZE;
 			pl++;
 		}
 	}
 
 	/*
 	 * Fill out the page array with any pages already in the cache.
 	 * NOTE(review): `*pl++' advances pl even when the lookup returns
 	 * NULL, so the terminating store after `out' lands one slot past
 	 * that NULL entry - confirm the caller's array always has room.
 	 */
 	while (plsz > 0 &&
 	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
 			off += PAGESIZE;
 			plsz -= PAGESIZE;
 	}
 out:
 	if (err) {
 		/*
 		 * Release any pages we have previously locked.
 		 */
 		while (pl > pl0)
 			page_unlock(*--pl);
 	} else {
 		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	}
 
 	/* Terminate the list at the current position. */
 	*pl = NULL;
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Request a memory map for a section of a file.  This code interacts
  * with common code and the VM system as follows:
  *
  * - common code calls mmap(), which ends up in smmap_common()
  * - this calls VOP_MAP(), which takes you into (say) zfs
  * - zfs_map() calls as_map(), passing segvn_create() as the callback
  * - segvn_create() creates the new segment and calls VOP_ADDMAP()
  * - zfs_addmap() updates z_mapcnt
  */
 /*ARGSUSED*/
 static int
 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	segvn_crargs_t	vn_a;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * Eligibility checks, evaluated in order; the first one that
 	 * fails decides the error returned.
 	 */
 	if ((prot & PROT_WRITE) && (zp->z_pflags &
 	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		/* No writable mappings of protected files. */
 		error = SET_ERROR(EPERM);
 	} else if ((prot & (PROT_READ | PROT_EXEC)) &&
 	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 		/* Quarantined files may not be read or executed. */
 		error = SET_ERROR(EACCES);
 	} else if (vp->v_flag & VNOMAP) {
 		error = SET_ERROR(ENOSYS);
 	} else if (off < 0 || len > MAXOFFSET_T - off) {
 		/* The requested range must fit below MAXOFFSET_T. */
 		error = SET_ERROR(ENXIO);
 	} else if (vp->v_type != VREG) {
 		error = SET_ERROR(ENODEV);
 	} else if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
 		/*
 		 * If file is locked, disallow mapping.
 		 */
 		error = SET_ERROR(EAGAIN);
 	} else {
 		error = 0;
 	}
 
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Pick the address and create the segment under the range lock. */
 	as_rangelock(as);
 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
 	if (error == 0) {
 		vn_a.vp = vp;
 		vn_a.offset = (u_offset_t)off;
 		vn_a.type = flags & MAP_TYPE;
 		vn_a.prot = prot;
 		vn_a.maxprot = maxprot;
 		vn_a.cred = cr;
 		vn_a.amp = NULL;
 		vn_a.flags = flags & ~MAP_TYPE;
 		vn_a.szc = 0;
 		vn_a.lgrp_mem_policy_flags = 0;
 
 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
 	}
 	as_rangeunlock(as);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Account for a new mapping of vp by adding the number of pages that
  * cover len to the znode's z_mapcnt.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	uint64_t	npages;
 
 	npages = btopr(len);
 	atomic_add_64(&zp->z_mapcnt, npages);
 
 	return (0);
 }
 
 /*
  * The reason we push dirty pages as part of zfs_delmap() is so that we get a
  * more accurate mtime for the associated file.  Since we don't have a way of
  * detecting when the data was actually modified, we have to resort to
  * heuristics.  If an explicit msync() is done, then we mark the mtime when the
  * last page is pushed.  The problem occurs when the msync() call is omitted,
  * which is by far the most common case:
  *
  *	open()
  *	mmap()
  *	<modify memory>
  *	munmap()
  *	close()
  *	<time lapse>
  *	putpage() via fsflush
  *
  * If we wait until fsflush to come along, we can have a modification time that
  * is some arbitrary point in the future.  In order to prevent this in the
  * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
  * torn down.
  */
 /* ARGSUSED */
 static int
 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	uint64_t	npages = btopr(len);
 
 	/* Drop the page count that the matching addmap added. */
 	ASSERT3U(zp->z_mapcnt, >=, npages);
 	atomic_add_64(&zp->z_mapcnt, -npages);
 
 	/* Only a shared, writable mapping can have dirtied pages. */
 	if (!(flags & MAP_SHARED) || !(prot & PROT_WRITE))
 		return (0);
 
 	/* Start an asynchronous push; its result is deliberately ignored. */
 	if (vn_has_cached_data(vp))
 		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
 
 	return (0);
 }
 
 /*
  * Free or allocate space in a file.  Currently, this function only
  * supports the `F_FREESP' command.  However, this command is somewhat
  * misnamed, as its functionality includes the ability to allocate as
  * well as free space.
  *
  *	IN:	vp	- vnode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset.
  *		cr	- credentials of caller [UNUSED].
  *		ct	- caller context.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime|mtime updated
  */
 /* ARGSUSED */
 static int
 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* F_FREESP is the only command implemented. */
 	if (cmd != F_FREESP) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 	 * callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		error = SET_ERROR(EROFS);
 		goto out;
 	}
 
 	/* Let convoff() normalise *bfp before it is interpreted. */
 	if ((error = convoff(vp, bfp, 0, offset)) != 0)
 		goto out;
 
 	/* A negative length is invalid. */
 	if (bfp->l_len < 0) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/* A length of 0 means from l_start to end of file. */
 	error = zfs_freesp(zp, (uint64_t)bfp->l_start, (uint64_t)bfp->l_len,
 	    flag, TRUE);
 
 out:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 #endif	/* illumos */
 
 /*
  * zfs_fid() fills in a zfid_short or zfid_long through the caller's
  * fid pointer, so neither layout may be larger than struct fid.
  */
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*
  * Build the file identifier for vp in *fidp.  The object number and
  * generation are stored least-significant byte first; when the file
  * system is not its own z_parent the long form is used, which also
  * carries the objset id and a zeroed objset generation.
  *
  *	RETURN:	0 on success, error code on failure.
  */
 /*ARGSUSED*/
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zfid_short_t	*zfid = (zfid_short_t *)fidp;
 	uint64_t	object = zp->z_id;
 	uint64_t	gen64;
 	uint32_t	gen;
 	int		size, i, error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &gen64, sizeof (uint64_t));
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen = (uint32_t)gen64;
 	if (gen == 0)
 		gen = 1;
 
 	if (zfsvfs->z_parent != zfsvfs)
 		size = LONG_FID_LEN;
 	else
 		size = SHORT_FID_LEN;
 
 #ifdef illumos
 	/* Report the size needed when the caller's buffer is too small. */
 	if (fidp->fid_len < size) {
 		fidp->fid_len = size;
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENOSPC));
 	}
 #else
 	fidp->fid_len = size;
 #endif
 
 	zfid->zf_len = size;
 
 	/* Emit the low byte, then shift the next byte into place. */
 	for (i = 0; i < sizeof (zfid->zf_object); i++, object >>= 8)
 		zfid->zf_object[i] = (uint8_t)object;
 
 	for (i = 0; i < sizeof (zfid->zf_gen); i++, gen >>= 8)
 		zfid->zf_gen[i] = (uint8_t)gen;
 
 	if (size == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++, objsetid >>= 8)
 			zlfid->zf_setid[i] = (uint8_t)objsetid;
 
 		/* XXX - this should be the generation number for the objset */
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			zlfid->zf_setgen[i] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Report the value of the configurable limit or option `cmd' for vp
  * through *valp.
  *
  *	RETURN:	0 on success, EOPNOTSUPP for a cmd not handled here, or
  *		(illumos only) an error from the extended attribute
  *		directory lookup.
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	zfs_dirlock_t	*dl;
 	int		error;
 
 	/*
 	 * Cases that only store a constant break out to the common
 	 * successful return at the bottom.
 	 */
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = INT_MAX;
 		break;
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		break;
 #ifdef illumos
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
 		if (error == 0) {
 			/* Report 1 only for a non-empty attribute dir. */
 			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			VN_RELE(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 
 	case _PC_SATTR_ENABLED:
 	case _PC_SATTR_EXISTS:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 		    (vp->v_type == VREG || vp->v_type == VDIR);
 		break;
 
 	case _PC_ACCESS_FILTERING:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
 		    vp->v_type == VDIR;
 		break;
 
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		break;
 #endif	/* illumos */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		break;
 #ifdef illumos
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		break;
 #endif
 	case _PC_ACL_EXTENDED:
 		*valp = 0;
 		break;
 
 	case _PC_ACL_NFS4:
 		*valp = 1;
 		break;
 
 	case _PC_ACL_PATH_MAX:
 		*valp = ACL_MAX_ENTRIES;
 		break;
 
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*
  * Fetch the ACL of vp into vsecp via zfs_getacl().  The ACL access
  * check is skipped when ATTR_NOACLCHECK is set in flag.
  */
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	boolean_t	skipaclchk = B_FALSE;
 	int		error;
 
 	if (flag & ATTR_NOACLCHECK)
 		skipaclchk = B_TRUE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Replace the ACL of vp with the one in vsecp via zfs_setacl().  The
  * ACL access check is skipped when ATTR_NOACLCHECK is set in flag.
  * With sync=always the intent log is committed before returning,
  * whether or not the update succeeded.
  */
 /*ARGSUSED*/
 int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	boolean_t	skipaclchk = B_FALSE;
 	int		error;
 
 	if (flag & ATTR_NOACLCHECK)
 		skipaclchk = B_TRUE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 #ifdef illumos
 /*
  * The smallest read we may consider to loan out an arcbuf.
  * This must be a power of 2.
  *
  * zfs_reqzcbuf() raises the block size it uses for UIO_READ requests to
  * this value when the file's block size is smaller.
  */
 int zcr_blksz_min = (1 << 10);	/* 1K */
 /*
  * If set to less than the file block size, allow loaning out of an
  * arcbuf for a partial block read.  This must be a power of 2.
  *
  * zfs_reqzcbuf() lowers the block size it uses for UIO_READ requests to
  * this value when the file's block size is larger.
  */
 int zcr_blksz_max = (1 << 17);	/* 128K */
 
 /*
  * Prepare a zero-copy xuio for a read or a write on vp.
  *
  * Only xuios of type UIOTYPE_ZEROCOPY are accepted; anything else yields
  * EINVAL.
  *
  * UIO_WRITE: the request must be at least zfsvfs->z_max_blksz bytes and
  * the file's block size must equal z_max_blksz.  The request is split into
  * an optional unaligned head ("preamble"), a run of full blocks and an
  * optional tail ("postamble"); one arc_buf of blksz bytes is requested for
  * each piece and attached to the xuio with dmu_xuio_add().
  *
  * UIO_READ: no buffers are requested here.  The request is only validated:
  * the block size (clamped to [zcr_blksz_min, zcr_blksz_max]) must not
  * exceed z_max_blksz, the readable size (limited to the bytes remaining
  * before z_size) must be at least one such block, and the vnode must have
  * no cached data.
  *
  * On success the xuio is marked with UIO_XUIO, the direction is recorded
  * via XUIO_XUZC_RW() and 0 is returned.  Every rejection returns EINVAL.
  */
 /*ARGSUSED*/
 static int
 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int max_blksz = zfsvfs->z_max_blksz;
 	uio_t *uio = &xuio->xu_uio;
 	ssize_t size = uio->uio_resid;
 	offset_t offset = uio->uio_loffset;
 	int blksz;
 	int fullblk, i;
 	arc_buf_t *abuf;
 	ssize_t maxsize;
 	int preamble, postamble;
 
 	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	switch (ioflag) {
 	case UIO_WRITE:
 		/*
 		 * Loan out an arc_buf for write if write size is bigger than
 		 * max_blksz, and the file's block size is also max_blksz.
 		 */
 		blksz = max_blksz;
 		if (size < blksz || zp->z_blksz != blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 		/*
 		 * Caller requests buffers for write before knowing where the
 		 * write offset might be (e.g. NFS TCP write).
 		 */
 		if (offset == -1) {
 			preamble = 0;
 		} else {
 			/*
 			 * preamble becomes the number of bytes from offset up
 			 * to the next block boundary (0 if already aligned).
 			 */
 			preamble = P2PHASE(offset, blksz);
 			if (preamble) {
 				preamble = blksz - preamble;
 				size -= preamble;
 			}
 		}
 
 		/* Bytes left over after the last full block. */
 		postamble = P2PHASE(size, blksz);
 		size -= postamble;
 
 		fullblk = size / blksz;
 		/* One xuio slot per piece: head, each full block, tail. */
 		(void) dmu_xuio_init(xuio,
 		    (preamble != 0) + fullblk + (postamble != 0));
 		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
 		    int, postamble, int,
 		    (preamble != 0) + fullblk + (postamble != 0));
 
 		/*
 		 * Have to fix iov base/len for partial buffers.  They
 		 * currently represent full arc_buf's.
 		 */
 		if (preamble) {
 			/* data begins in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf,
 			    blksz - preamble, preamble);
 		}
 
 		for (i = 0; i < fullblk; i++) {
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
 		}
 
 		if (postamble) {
 			/* data ends in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
 		}
 		break;
 	case UIO_READ:
 		/*
 		 * Loan out an arc_buf for read if the read size is larger than
 		 * the current file block size.  Block alignment is not
 		 * considered.  Partial arc_buf will be loaned out for read.
 		 */
 		blksz = zp->z_blksz;
 		if (blksz < zcr_blksz_min)
 			blksz = zcr_blksz_min;
 		if (blksz > zcr_blksz_max)
 			blksz = zcr_blksz_max;
 		/* avoid potential complexity of dealing with it */
 		if (blksz > max_blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		/* Never consider more than what is left before z_size. */
 		maxsize = zp->z_size - uio->uio_loffset;
 		if (size > maxsize)
 			size = maxsize;
 
 		if (size < blksz || vn_has_cached_data(vp)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 		break;
 	default:
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Accepted: tag the uio and remember the transfer direction. */
 	uio->uio_extflg = UIO_XUIO;
 	XUIO_XUZC_RW(xuio) = ioflag;
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Give back the arc_bufs still attached to a zero-copy xuio and tear the
  * xuio state down.
  *
  * The slots are visited from the highest index to the lowest.  A slot
  * holding NULL is expected only for a write xuio (the buffer was already
  * consumed by zfs_write()).  Always returns 0.
  */
 /*ARGSUSED*/
 static int
 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
 {
 	int ioflag = XUIO_XUZC_RW(xuio);
 	arc_buf_t *abuf;
 	int slot;
 
 	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
 
 	for (slot = dmu_xuio_cnt(xuio) - 1; slot >= 0; slot--) {
 		abuf = dmu_xuio_arcbuf(xuio, slot);
 		if (abuf == NULL) {
 			/* Only a write buffer may already be gone. */
 			ASSERT(ioflag == UIO_WRITE);
 			continue;
 		}
 		dmu_return_arcbuf(abuf);
 	}
 
 	dmu_xuio_fini(xuio);
 	return (0);
 }
 
 /*
  * Predeclare these here so that the compiler assumes that
  * this is an "old style" function declaration that does
  * not include arguments => we won't get type mismatch errors
  * in the initializations that follow.
  */
 static int zfs_inval();
 static int zfs_isdir();
 
 /*
  * Generic "operation not valid here" handler: always fails with EINVAL.
  * Declared without a parameter list so it can be stored in the .error
  * member of the vnode operation templates.
  */
 static int
 zfs_inval()
 {
 	return (SET_ERROR(EINVAL));
 }
 
 /*
  * Generic "this is a directory" handler: always fails with EISDIR.
  * Declared without a parameter list so it can be stored in the .error
  * member of the vnode operation templates.
  */
 static int
 zfs_isdir()
 {
 	return (SET_ERROR(EISDIR));
 }
 /*
  * Directory vnode operations template
  *
  * Each entry pairs an operation name with its handler.  VOPNAME_READ and
  * VOPNAME_WRITE are routed to zfs_isdir(), so they fail with EISDIR.
  * The NULL, NULL pair is the final entry of the table.
  */
 vnodeops_t *zfs_dvnodeops;
 const fs_operation_def_t zfs_dvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_READ,		{ .error = zfs_isdir },
 	VOPNAME_WRITE,		{ .error = zfs_isdir },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_CREATE,		{ .vop_create = zfs_create },
 	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
 	VOPNAME_LINK,		{ .vop_link = zfs_link },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
 	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
 	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
 	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Regular file vnode operations template
  *
  * Besides the common attribute and name operations this table installs the
  * data path (zfs_read/zfs_write), record locking, space management, the
  * paging and mapping handlers (GETPAGE/PUTPAGE/MAP/ADDMAP/DELMAP) and the
  * zero-copy buffer handlers zfs_reqzcbuf()/zfs_retzcbuf().
  * The NULL, NULL pair is the final entry of the table.
  */
 vnodeops_t *zfs_fvnodeops;
 const fs_operation_def_t zfs_fvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_READ,		{ .vop_read = zfs_read },
 	VOPNAME_WRITE,		{ .vop_write = zfs_write },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
 	VOPNAME_SPACE,		{ .vop_space = zfs_space },
 	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
 	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
 	VOPNAME_MAP,		{ .vop_map = zfs_map },
 	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
 	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
 	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
 	NULL,			NULL
 };
 
 /*
  * Symbolic link vnode operations template
  *
  * No open/close/read/write handlers are listed; the link target is
  * obtained through VOPNAME_READLINK (zfs_readlink).
  * The NULL, NULL pair is the final entry of the table.
  */
 vnodeops_t *zfs_symvnodeops;
 const fs_operation_def_t zfs_symvnodeops_template[] = {
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * special share hidden files vnode operations template
  *
  * Only attribute query, access check, ACL get/set, fid, pathconf,
  * inactive and vnevent handlers are listed; there are no data or
  * namespace operations in this table.
  * The NULL, NULL pair is the final entry of the table.
  */
 vnodeops_t *zfs_sharevnodeops;
 const fs_operation_def_t zfs_sharevnodeops_template[] = {
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Extended attribute directory vnode operations template
  *
  * This template is identical to the directory vnodes
  * operation template except for restricted operations:
  *	VOP_MKDIR()
  *	VOP_SYMLINK()
  * Both are routed to zfs_inval() and therefore fail with EINVAL.
  *
  * NOTE(review): unlike zfs_dvnodeops_template, this table also has no
  * VOPNAME_READ / VOPNAME_WRITE entries.
  *
  * Note that there are other restrictions embedded in:
  *	zfs_create()	- restrict type to VREG
  *	zfs_link()	- no links into/out of attribute space
  *	zfs_rename()	- no moves into/out of attribute space
  */
 vnodeops_t *zfs_xdvnodeops;
 const fs_operation_def_t zfs_xdvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_CREATE,		{ .vop_create = zfs_create },
 	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
 	VOPNAME_LINK,		{ .vop_link = zfs_link },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_MKDIR,		{ .error = zfs_inval },
 	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
 	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
 	VOPNAME_SYMLINK,	{ .error = zfs_inval },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Error vnode operations template
  *
  * Only VOPNAME_INACTIVE and VOPNAME_PATHCONF have handlers here.
  * The NULL, NULL pair is the final entry of the table.
  */
 vnodeops_t *zfs_evnodeops;
 const fs_operation_def_t zfs_evnodeops_template[] = {
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	NULL,			NULL
 };
 #endif	/* illumos */
 
 /*
  * Translate the IO_* bits of a VOP read/write request into the F* flag
  * bits passed to zfs_read() and zfs_write():
  *	IO_APPEND -> FAPPEND
  *	IO_NDELAY -> FNONBLOCK
  *	IO_SYNC   -> FSYNC | FDSYNC | FRSYNC
  * Any other input bit is ignored.
  */
 static int
 ioflags(int ioflags)
 {
 	int flags;
 
 	flags = ((ioflags & IO_APPEND) != 0) ? FAPPEND : 0;
 	if ((ioflags & IO_NDELAY) != 0) {
 		flags |= FNONBLOCK;
 	}
 	if ((ioflags & IO_SYNC) != 0) {
 		flags |= FSYNC | FDSYNC | FRSYNC;
 	}
 
 	return (flags);
 }
 
 /*
  * Fill the pages m[0 .. count - 1] of vp's VM object with file data.
  *
  * If the last page of the request is already valid it is dropped from the
  * request; if that empties the request the function succeeds at once.
  * A request whose (remaining) last page starts at or beyond the object
  * size is rejected with zfs_vm_pagerret_bad.
  *
  * Each page is mapped, filled with dmu_read(), and the part of the last
  * page that lies beyond the object size is zeroed.  Once every read has
  * succeeded, all pages are marked fully valid.
  *
  * *rbehind and *rahead, when supplied, are set to 0 on success.
  *
  * Returns zfs_vm_pagerret_ok on success, zfs_vm_pagerret_error if a
  * dmu_read() fails, or zfs_vm_pagerret_bad as described above.
  */
 static int
 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
     int *rahead)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_page_t mlast;
 	vm_object_t object;
 	caddr_t va;
 	struct sf_buf *sf;
 	int i, error;
 	int lsize, size;
 
 	object = m[0]->object;
 	error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zfs_vmobject_wlock(object);
 	/* Skip the last page if it is already valid. */
 	if (m[count - 1]->valid != 0 && --count == 0) {
 		zfs_vmobject_wunlock(object);
 		goto out;
 	}
 
 	mlast = m[count - 1];
 
 	if (IDX_TO_OFF(mlast->pindex) >=
 	    object->un_pager.vnp.vnp_size) {
 		zfs_vmobject_wunlock(object);
 		ZFS_EXIT(zfsvfs);
 		return (zfs_vm_pagerret_bad);
 	}
 
 	/*
 	 * Account for one page-in operation covering the pages that are
 	 * actually going to be read.
 	 */
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, count);
 
 	/* The last page may be only partially backed by the object. */
 	lsize = PAGE_SIZE;
 	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
 		lsize = object->un_pager.vnp.vnp_size -
 		    IDX_TO_OFF(mlast->pindex);
 	zfs_vmobject_wunlock(object);
 
 	for (i = 0; i < count; i++) {
 		size = PAGE_SIZE;
 		if (i == count - 1)
 			size = lsize;
 		va = zfs_map_page(m[i], &sf);
 		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
 		    size, va, DMU_READ_PREFETCH);
 		/* Zero whatever lies beyond the object size. */
 		if (size != PAGE_SIZE)
 			bzero(va + size, PAGE_SIZE - size);
 		zfs_unmap_page(sf);
 		if (error != 0)
 			goto out;
 	}
 
 	zfs_vmobject_wlock(object);
 	for (i = 0; i < count; i++)
 		m[i]->valid = VM_PAGE_BITS_ALL;
 	zfs_vmobject_wunlock(object);
 
 out:
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	if (error == 0) {
 		if (rbehind)
 			*rbehind = 0;
 		if (rahead)
 			*rahead = 0;
 		return (zfs_vm_pagerret_ok);
 	} else
 		return (zfs_vm_pagerret_error);
 }
 
 /*
  * VOP_GETPAGES entry point: unpack the argument structure and hand the
  * request to zfs_getpages(), returning its result.
  */
 static int
 zfs_freebsd_getpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int *a_rbehind;
 		int *a_rahead;
 	} */ *ap;
 {
 	int rv;
 
 	rv = zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead);
 	return (rv);
 }
 
 /*
  * Write the pages ma[0 .. btoc(len) - 1] of vp's VM object to the file.
  *
  * rtvals[] receives one pager status per page: every slot starts out as
  * zfs_vm_pagerret_error, pages lying entirely beyond the object size are
  * marked zfs_vm_pagerret_bad, and pages that were written are marked
  * zfs_vm_pagerret_ok.  The return value is rtvals[0].
  *
  * The write is trimmed to the object size, performed inside a single DMU
  * transaction under a block-aligned RL_WRITER range lock, and recorded in
  * the intent log.  The log is committed before returning when flags asks
  * for a synchronous or invalidating put or os_sync is ZFS_SYNC_ALWAYS.
  */
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	rl_t		*rl;
 	dmu_tx_t	*tx;
 	struct sf_buf	*sf;
 	vm_object_t	object;
 	vm_page_t	m;
 	caddr_t		va;
 	size_t		tocopy;
 	size_t		lo_len;
 	size_t		wlen;
 	vm_ooffset_t	lo_off;
 	vm_ooffset_t	off;
 	vm_ooffset_t	woff;
 	uint_t		blksz;
 	int		ncount;
 	int		pcount;
 	int		err;
 	int		i;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	object = vp->v_object;
 	pcount = btoc(len);
 	ncount = pcount;
 
 	KASSERT(ma[0]->object == object, ("mismatching object"));
 	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
 
 	/* Pessimistic default; overwritten below for each page. */
 	for (i = 0; i < pcount; i++)
 		rtvals[i] = zfs_vm_pagerret_error;
 
 	/* Lock the block-aligned range that covers the request. */
 	off = IDX_TO_OFF(ma[0]->pindex);
 	blksz = zp->z_blksz;
 	lo_off = rounddown(off, blksz);
 	lo_len = roundup(len + (off - lo_off), blksz);
 	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
 
 	zfs_vmobject_wlock(object);
 	if (len + off > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > off) {
 			int pgoff;
 
 			/* Trim the write to the object size. */
 			len = object->un_pager.vnp.vnp_size - off;
 			ncount = btoc(len);
 			if ((pgoff = (int)len & PAGE_MASK) != 0) {
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 				    ("zfs_putpages: page %p is not read-only", m));
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			/* The whole request lies beyond the object size. */
 			len = 0;
 			ncount = 0;
 		}
 		if (ncount < pcount) {
 			for (i = ncount; i < pcount; i++) {
 				rtvals[i] = zfs_vm_pagerret_bad;
 			}
 		}
 	}
 	zfs_vmobject_wunlock(object);
 
 	if (ncount == 0)
 		goto out;
 
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		goto out;
 	}
 
 top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err != 0) {
 		if (err == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
 		/*
 		 * Copy page by page using private copies of the offset and
 		 * length: off and len must still describe the whole write
 		 * when it is handed to zfs_log_write() below.
 		 */
 		woff = off;
 		wlen = len;
 		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
 			tocopy = wlen > PAGE_SIZE ? PAGE_SIZE : wlen;
 			va = zfs_map_page(ma[i], &sf);
 			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 
 		/* The written pages are clean now. */
 		zfs_vmobject_wlock(object);
 		for (i = 0; i < ncount; i++) {
 			rtvals[i] = zfs_vm_pagerret_ok;
 			vm_page_undirty(ma[i]);
 		}
 		zfs_vmobject_wunlock(object);
 		PCPU_INC(cnt.v_vnodeout);
 		PCPU_ADD(cnt.v_vnodepgsout, ncount);
 	}
 	dmu_tx_commit(tx);
 
 out:
 	zfs_range_unlock(rl);
 	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (rtvals[0]);
 }
 
 /*
  * VOP_PUTPAGES entry point: unpack the argument structure and hand the
  * request to zfs_putpages(), returning its result.  a_count is passed as
  * the length argument and a_sync as the flags argument.
  */
 int
 zfs_freebsd_putpages(ap)
 	struct vop_putpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_sync;
 		int *a_rtvals;
 	} */ *ap;
 {
 	int rv;
 
 	rv = zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
 	    ap->a_rtvals);
 	return (rv);
 }
 
 /*
  * VOP_BMAP entry point.
  *
  * Reports an identity mapping: the vnode's own bufobj, the logical block
  * number unchanged, and no run ahead or behind.  Each output pointer is
  * optional and is skipped when NULL.  Always returns 0.
  */
 static int
 zfs_freebsd_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 
 	if (ap->a_bop != NULL) {
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	}
 	if (ap->a_bnp != NULL) {
 		*ap->a_bnp = ap->a_bn;
 	}
 	if (ap->a_runp != NULL) {
 		*ap->a_runp = 0;
 	}
 	if (ap->a_runb != NULL) {
 		*ap->a_runb = 0;
 	}
 
 	return (0);
 }
 
 /*
  * VOP_OPEN entry point.
  *
  * Calls zfs_open(); when that succeeds, a VM object sized to the znode's
  * z_size is created for the vnode.  Returns the zfs_open() result.
  */
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	int error;
 
 	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
 
 	vnode_create_vobject(vp, zp->z_size, ap->a_td);
 	return (0);
 }
 
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL, NULL));
 }
 
 /*
  * VOP_READ entry point: converts a_ioflag with ioflags() and forwards the
  * request to zfs_read(), returning its result.
  */
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int zflags = ioflags(ap->a_ioflag);
 
 	return (zfs_read(ap->a_vp, ap->a_uio, zflags, ap->a_cred, NULL));
 }
 
 /*
  * VOP_WRITE entry point: converts a_ioflag with ioflags() and forwards the
  * request to zfs_write(), returning its result.
  */
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int zflags = ioflags(ap->a_ioflag);
 
 	return (zfs_write(ap->a_vp, ap->a_uio, zflags, ap->a_cred, NULL));
 }
 
 /*
  * VOP_ACCESS entry point.
  *
  * The requested mode is checked in three steps, stopping at the first
  * failure:
  *   1. the VREAD/VWRITE/VEXEC/VAPPEND bits go to zfs_access();
  *   2. all remaining bits go to vaccess();
  *   3. a VEXEC request on a non-directory with no execute bit set in
  *      z_mode is refused with EACCES.
  */
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	accmode_t accmode;
 	int error;
 
 	/*
 	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0) {
 		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * VADMIN has to be handled by vaccess().
 	 */
 	accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0) {
 		error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 		    zp->z_gid, accmode, ap->a_cred, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * For VEXEC, ensure that at least one execute bit is set for
 	 * non-directories.
 	 */
 	if ((ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
 	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
 		return (EACCES);
 
 	return (0);
 }
 
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread, 0));
 }
 
 /*
  * VOP_CREATE entry point.
  *
  * Initializes the attribute mask, extracts the permission bits from
  * va_mode and calls zfs_create() in non-exclusive mode.  When
  * FREEBSD_NAMECACHE is defined, a successful create with MAKEENTRY set
  * also enters the new vnode into the name cache.
  */
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int error, mode;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 
 	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
 #ifdef FREEBSD_NAMECACHE
 	if (error != 0)
 		return (error);
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 #endif
 	return (error);
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred, NULL, 0));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred, NULL, 0, NULL));
 }
 
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
 }
 
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 
 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vop_stdfsync(ap);
 	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
 }
 
 /*
  * VOP_GETATTR entry point.
  *
  * Wraps the caller's vattr in an xvattr, requests the optional ZFS
  * attributes that have a chflags counterpart, and calls zfs_getattr().
  * On success the plain vattr is copied back to the caller and va_flags is
  * set to the chflags bits built from the optional attributes that were
  * returned with a non-zero value.  On failure the caller's vattr is left
  * untouched and the zfs_getattr() error is returned.
  */
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 	xvattr_t xvap;
 	u_long fflags;
 	int error;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 	xvap.xva_vattr.va_mask |= AT_XVATTR;
 
 	/* Ask for every optional attribute that is translated below. */
 	/* XXX: what about SF_SETTABLE?. */
 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
 	XVA_SET_REQ(&xvap, XAT_READONLY);
 	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
 	XVA_SET_REQ(&xvap, XAT_SYSTEM);
 	XVA_SET_REQ(&xvap, XAT_HIDDEN);
 	XVA_SET_REQ(&xvap, XAT_REPARSE);
 	XVA_SET_REQ(&xvap, XAT_OFFLINE);
 	XVA_SET_REQ(&xvap, XAT_SPARSE);
 
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Translate each returned, non-zero ZFS attribute into the matching
 	 * chflags bit.
 	 */
 	fflags = 0;
 #define	XAT_TO_FFLAG(xflag, xfield, fflag)	do {			\
 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
 		fflags |= (fflag);					\
 } while (0)
 	XAT_TO_FFLAG(XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable,
 	    SF_IMMUTABLE);
 	XAT_TO_FFLAG(XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly,
 	    SF_APPEND);
 	XAT_TO_FFLAG(XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink,
 	    SF_NOUNLINK);
 	XAT_TO_FFLAG(XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive,
 	    UF_ARCHIVE);
 	XAT_TO_FFLAG(XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump,
 	    UF_NODUMP);
 	XAT_TO_FFLAG(XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly,
 	    UF_READONLY);
 	XAT_TO_FFLAG(XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system,
 	    UF_SYSTEM);
 	XAT_TO_FFLAG(XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden,
 	    UF_HIDDEN);
 	XAT_TO_FFLAG(XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse,
 	    UF_REPARSE);
 	XAT_TO_FFLAG(XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline,
 	    UF_OFFLINE);
 	XAT_TO_FFLAG(XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse,
 	    UF_SPARSE);
 #undef	XAT_TO_FFLAG
 
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
 	return (0);
 }
 
 /*
  * VOP_SETATTR entry point.
  *
  * Wraps the caller's vattr in an xvattr and calls zfs_setattr().  When
  * va_flags is set (not VNOVAL) the chflags bits are first validated and
  * translated into optional ZFS attribute requests:
  *   - EOPNOTSUPP if the file system does not use FUIDs or an unsupported
  *     flag bit is present;
  *   - callers that pass the secpolicy_fs_owner()/PRIV_VFS_SYSFLAGS check
  *     are subject to the securelevel check when a system flag is
  *     currently set;
  *   - other callers need VADMIN access and get EPERM if a system flag is
  *     currently set or requested;
  *   - for every flag whose requested state differs from the current
  *     z_pflags state, the matching XAT_* attribute is requested with the
  *     new value.
  */
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	cred_t *cred = ap->a_cred;
 	xvattr_t xvap;
 	u_long fflags;
 	uint64_t zflags;
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
 	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
 		int error;
 
 		if (zfsvfs->z_use_fuids == B_FALSE)
 			return (EOPNOTSUPP);
 
 		fflags = vap->va_flags;
 		/*
 		 * XXX KDM 
 		 * We need to figure out whether it makes sense to allow
 		 * UF_REPARSE through, since we don't really have other
 		 * facilities to handle reparse points and zfs_setattr()
 		 * doesn't currently allow setting that attribute anyway.
 		 */
 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
 		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
 		     UF_OFFLINE|UF_SPARSE)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the security.jail.chflags_allowed sysctl is
 		 * non-zero; otherwise, they behave like unprivileged
 		 * processes.
 		 */
 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
 		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				error = securelevel_gt(cred, 0);
 				if (error != 0)
 					return (error);
 			}
 		} else {
 			/*
 			 * Callers may only modify the file flags on objects they
 			 * have VADMIN rights for.
 			 */
 			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
 				return (error);
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				return (EPERM);
 			}
 			if (fflags &
 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
 				return (EPERM);
 			}
 		}
 
 		/*
 		 * Request an attribute change only when the requested chflags
 		 * state differs from the state currently held in z_pflags.
 		 */
 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
 		XVA_SET_REQ(&xvap, (xflag));				\
 		(xfield) = ((fflags & (fflag)) != 0);			\
 	}								\
 } while (0)
 		/* Convert chflags into ZFS-type flags. */
 		/* XXX: what about SF_SETTABLE?. */
 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 		    xvap.xva_xoptattrs.xoa_immutable);
 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
 		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
 		    xvap.xva_xoptattrs.xoa_archive);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
 		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
 		    xvap.xva_xoptattrs.xoa_readonly);
 		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
 		    xvap.xva_xoptattrs.xoa_system);
 		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
 		    xvap.xva_xoptattrs.xoa_hidden);
 		/*
 		 * The reparse request must carry its value in xoa_reparse;
 		 * storing it in xoa_hidden would overwrite the hidden value
 		 * set just above and leave xoa_reparse unset.
 		 */
 		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
 		    xvap.xva_xoptattrs.xoa_reparse);
 		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
 		    xvap.xva_xoptattrs.xoa_offline);
 		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
 		    xvap.xva_xoptattrs.xoa_sparse);
 #undef	FLAG_CHANGE
 	}
 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
 }
 
 /*
  * VOP_RENAME entry point.
  *
  * Fails with EXDEV when the source directory and the target directory (or
  * an existing target vnode) are on different mounts; otherwise calls
  * zfs_rename().  In both cases the four vnodes are released before
  * returning: the target directory with VN_RELE() when it is the same
  * vnode as the target and with VN_URELE() otherwise, the target (if any)
  * with VN_URELE(), and the source directory and source with VN_RELE().
  */
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 
 	/*
 	 * A rename may not cross mount points.
 	 */
 	if (fdvp->v_mount != tdvp->v_mount ||
 	    (tvp != NULL && fdvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 	} else {
 		error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
 		    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
 	}
 
 	if (tdvp != tvp)
 		VN_URELE(tdvp);
 	else
 		VN_RELE(tdvp);
 	if (tvp != NULL)
 		VN_URELE(tvp);
 	VN_RELE(fdvp);
 	VN_RELE(fvp);
 
 	return (error);
 }
 
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *vp = ap->a_vp;
 	vnode_t *tdvp = ap->a_tdvp;
 
 	if (tdvp->v_mount != vp->v_mount)
 		return (EXDEV);
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
 }
 
 /*
  * VOP_INACTIVE entry point: calls zfs_inactive() with the calling thread's
  * credentials.  Always returns 0.
  */
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	zfs_inactive(ap->a_vp, ap->a_td->td_ucred, NULL);
 	return (0);
 }
 
 /*
  * VOP_RECLAIM entry point.
  *
  * Destroys the vnode's VM object, then disposes of the znode while holding
  * z_teardown_inactive_lock as reader: a znode without an SA handle is
  * freed directly, any other goes through zfs_zinactive().  Finally the
  * vnode's v_data pointer is cleared.  Always returns 0.
  */
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ASSERT(zp != NULL);
 
 	/* Tear down the vm object; this also flushes its pages. */
 	vnode_destroy_vobject(vp);
 
 	/*
 	 * Holding z_teardown_inactive_lock keeps this from racing with
 	 * zfs_znode_dmu_fini in zfsvfs_teardown during a forced unmount.
 	 */
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl != NULL) {
 		zfs_zinactive(zp);
 	} else {
 		zfs_znode_free(zp);
 	}
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * VOP_FID entry point: forwards to zfs_fid(), passing the fid buffer as an
  * untyped pointer, and returns its result.
  */
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	void *fidp = (void *)ap->a_fid;
 
 	return (zfs_fid(ap->a_vp, fidp, NULL));
 }
 
 /*
  * VOP_PATHCONF entry point.
  *
  * Asks zfs_pathconf() first.  On success its value is stored in
  * *a_retval; when it answers EOPNOTSUPP the request falls back to
  * vop_stdpathconf(); any other error is returned as is.
  */
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
 	switch (error) {
 	case 0:
 		*ap->a_retval = val;
 		break;
 	case EOPNOTSUPP:
 		error = vop_stdpathconf(ap);
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 /*
  * VOP_PATHCONF entry point for FIFOs.  The ACL and MAC related names are
  * answered by zfs_freebsd_pathconf(); every other name is delegated to
  * the fifo_specops pathconf handler.
  */
 static int
 zfs_freebsd_fifo_pathconf(struct vop_pathconf_args *ap)
 /*
 vop_pathconf {
 	struct vnode *a_vp;
 	int a_name;
 	register_t *a_retval;
 };
 */
 {
 	int name = ap->a_name;
 
 	if (name == _PC_ACL_EXTENDED || name == _PC_ACL_NFS4 ||
 	    name == _PC_ACL_PATH_MAX || name == _PC_MAC_PRESENT)
 		return (zfs_freebsd_pathconf(ap));
 
 	return (fifo_specops.vop_pathconf(ap));
 }
 
 /*
  * Build the on-disk ZFS attribute name for a FreeBSD extended attribute.
  *
  * FreeBSD's extended attribute namespace selects the file name prefix:
  *
  *	NAMESPACE	PREFIX
  *	system		freebsd:system:
  *	user		(none, can be used to access ZFS fsattr(5) attributes
  *			created on Solaris)
  *
  * The result is written to attrname (capacity size bytes), which is
  * zeroed first.  Returns 0 on success, EINVAL if name contains '/' or
  * starts with "freebsd:" or if the namespace is not user/system, and
  * ENAMETOOLONG if the composed name does not fit.
  */
 static int
 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
     size_t size)
 {
 	const char *prefix, *nsname, *sep;
 	int len;
 
 	/* A '/' is never allowed inside an attribute name. */
 	if (strchr(name, '/') != NULL)
 		return (EINVAL);
 	/* Names may not collide with the reserved "freebsd:" prefix. */
 	if (strncmp(name, "freebsd:", 8) == 0)
 		return (EINVAL);
 
 	bzero(attrname, size);
 
 	if (attrnamespace == EXTATTR_NAMESPACE_USER) {
 		/*
 		 * The user namespace is deliberately left unprefixed: it is
 		 * the default namespace through which all attributes created
 		 * on Solaris can be accessed.
 		 */
 		prefix = nsname = sep = "";
 	} else if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		prefix = "freebsd:";
 		nsname = EXTATTR_NAMESPACE_SYSTEM_STRING;
 		sep = ":";
 	} else {
 		/* EXTATTR_NAMESPACE_EMPTY and anything unknown. */
 		return (EINVAL);
 	}
 
 	len = snprintf(attrname, size, "%s%s%s%s", prefix, nsname, sep, name);
 	if (len >= size)
 		return (ENAMETOOLONG);
 
 	return (0);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  *
  * Steps, in order:
  *   1. extattr_check_cred() verifies VREAD access for the namespace.
  *   2. zfs_create_attrname() maps (namespace, name) to the ZFS name.
  *   3. zfs_lookup() with LOOKUP_XATTR yields xvp, the vnode the attribute
  *      name is then resolved relative to.
  *   4. The attribute is opened read-only relative to xvp.
  *   5. If a_size is non-NULL only the attribute size is reported;
  *      otherwise, if a_uio is non-NULL, the data is read into it.
  *
  * Returns 0 or an errno; ENOENT from the open is reported as ENOATTR.
  */
 static int
 zfs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Open the attribute by name, relative to xvp, for reading. */
 	flags = FREAD;
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/* A missing file here means a missing attribute. */
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	/* Size query takes precedence over a data read. */
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
 		if (error == 0)
 			*ap->a_size = (size_t)va.va_size;
 	} else if (ap->a_uio != NULL)
 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  *
  * After a VWRITE credential check and name translation, the attribute is
  * looked up relative to the vnode returned by zfs_lookup(LOOKUP_XATTR)
  * with both parent and leaf locked, and removed with VOP_REMOVE().
  *
  * Returns 0 or an errno; ENOENT from the lookup is reported as ENOATTR.
  */
 int
 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;	/* NOTE(review): not referenced below */
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;	/* NOTE(review): flags not referenced below */
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* DELETE lookup relative to xvp; parent and leaf come back locked. */
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/* A missing file here means a missing attribute. */
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	/*
 	 * Release parent and leaf.  If they are the same vnode it has
 	 * already been unlocked by the first vput(), so only drop the
 	 * remaining reference.
 	 */
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 zfs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR | CREATE_XATTR_DIR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
 		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  *
  * Enumerates the entries of the vnode returned by
  * zfs_lookup(LOOKUP_XATTR) with VOP_READDIR() and reports those that
  * belong to the requested namespace.  If a_size is non-NULL the total
  * encoded length is accumulated into it; otherwise, if a_uio is non-NULL,
  * each name is copied out as one length byte followed by the name with
  * its namespace prefix stripped.
  *
  * Returns 0 or an errno.  ENOATTR from the lookup is treated as
  * "no attributes" and reported as success.
  */
 static int
 zfs_listextattr(struct vop_listextattr_args *ap)
 /*
 vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrprefix[16];
 	u_char dirbuf[sizeof(struct dirent)];
 	struct dirent *dp;
 	struct iovec aiov;
 	struct uio auio, *uio = ap->a_uio;
 	size_t *sizep = ap->a_size;
 	size_t plen;
 	vnode_t *xvp = NULL, *vp;
 	int done, error, eof, pos;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	/* An empty name yields just the namespace prefix (possibly ""). */
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof(attrprefix));
 	if (error != 0)
 		return (error);
 	plen = strlen(attrprefix);
 
 	ZFS_ENTER(zfsvfs);
 
 	if (sizep != NULL)
 		*sizep = 0;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/*
 		 * ENOATTR means that the EA directory does not yet exist,
 		 * i.e. there are no extended attributes there.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 		return (error);
 	}
 
 	/* Resolve "." relative to xvp to get a share-locked vnode to read. */
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Kernel-space uio used to read directory entries into dirbuf. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = 0;
 
 	do {
 		u_char nlen;
 
 		/* Refill dirbuf; uio_offset carries over between passes. */
 		aiov.iov_base = (void *)dirbuf;
 		aiov.iov_len = sizeof(dirbuf);
 		auio.uio_resid = sizeof(dirbuf);
 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
 		done = sizeof(dirbuf) - auio.uio_resid;
 		if (error != 0)
 			break;
 		for (pos = 0; pos < done;) {
 			dp = (struct dirent *)(dirbuf + pos);
 			pos += dp->d_reclen;
 			/*
 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
 			 * is what we get when attribute was created on Solaris.
 			 */
 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
 				continue;
 			/*
 			 * With an empty prefix, hide every "freebsd:" entry;
 			 * otherwise keep only entries carrying our prefix.
 			 */
 			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
 				continue;
 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
 				continue;
 			nlen = dp->d_namlen - plen;
 			if (sizep != NULL)
 				*sizep += 1 + nlen;
 			else if (uio != NULL) {
 				/*
 				 * Format of extattr name entry is one byte for
 				 * length and the rest for name.
 				 */
 				error = uiomove(&nlen, 1, uio->uio_rw, uio);
 				if (error == 0) {
 					error = uiomove(dp->d_name + plen, nlen,
 					    uio->uio_rw, uio);
 				}
 				/* Leaves the for loop; the while test ends the scan. */
 				if (error != 0)
 					break;
 			}
 		}
 	} while (!eof && error == 0);
 
 	vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * VOP_GETACL entry point.  Only ACL_TYPE_NFS4 is accepted (EINVAL
  * otherwise).  The ACEs are fetched with zfs_getsecattr(), converted into
  * the caller's struct acl with acl_from_aces(), and the temporary ACE
  * buffer is released.  Returns 0 or an errno.
  */
 int
 zfs_freebsd_getacl(struct vop_getacl_args *ap)
 /*
 vop_getacl {
 	struct vnode *a_vp;
 	acl_type_t a_type;
 	struct acl *a_aclp;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 */
 {
 	vsecattr_t	sa;
 	int		rc;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	/* Request both the entries and their count. */
 	sa.vsa_mask = VSA_ACE | VSA_ACECNT;
 	rc = zfs_getsecattr(ap->a_vp, &sa, 0, ap->a_cred, NULL);
 	if (rc != 0)
 		return (rc);
 
 	rc = acl_from_aces(ap->a_aclp, sa.vsa_aclentp, sa.vsa_aclcnt);
 	if (sa.vsa_aclentp != NULL)
 		kmem_free(sa.vsa_aclentp, sa.vsa_aclentsz);
 
 	return (rc);
 }
 
 /*
  * VOP_SETACL entry point.  Only ACL_TYPE_NFS4 is accepted.  The ACL is
  * range- and sanity-checked, converted to an ACE array with
  * aces_from_acl(), and applied with zfs_setsecattr().
  *
  * Returns 0, EINVAL for a wrong type or entry count, ENOSPC when the ACL
  * is too large to survive a later chmod(2), or an errno from
  * acl_nfs4_check() / zfs_setsecattr().
  */
 int
 zfs_freebsd_setacl(struct vop_setacl_args *ap)
 /*
 vop_setacl {
 	struct vnode *a_vp;
 	acl_type_t a_type;
 	struct acl *a_aclp;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 */
 {
 	vsecattr_t	sa;
 	void		*acebuf;
 	int		acebufsz;	/* size of the ACE array in bytes */
 	int		rc;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
 	/*
 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
 	 * splitting every entry into two and appending "canonical six"
 	 * entries at the end.  Reject an ACL that would leave chmod(2)
 	 * without enough room.
 	 */
 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
 		return (ENOSPC);
 
 	rc = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
 	if (rc != 0)
 		return (rc);
 
 	acebufsz = ap->a_aclp->acl_cnt * sizeof(ace_t);
 	acebuf = kmem_alloc(acebufsz, KM_SLEEP);
 
 	sa.vsa_mask = VSA_ACE;
 	sa.vsa_aclentp = acebuf;
 	sa.vsa_aclentsz = acebufsz;
 
 	aces_from_acl(sa.vsa_aclentp, &sa.vsa_aclcnt, ap->a_aclp);
 	rc = zfs_setsecattr(ap->a_vp, &sa, 0, ap->a_cred, NULL);
 
 	/* Free through the saved pointer, not through the vsecattr. */
 	kmem_free(acebuf, acebufsz);
 
 	return (rc);
 }
 
 /*
  * VOP_ACLCHECK entry point.  Not implemented: unconditionally returns
  * EOPNOTSUPP without looking at its arguments.
  */
 int
 zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
 /*
 vop_aclcheck {
 	struct vnode *a_vp;
 	acl_type_t a_type;
 	struct acl *a_aclp;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 */
 {
 
 	return (EOPNOTSUPP);
 }
 
 /* Forward declarations of the three operation vectors defined below. */
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
 
 /*
  * Default vnode operations vector for ZFS vnodes.  Operations not listed
  * here fall through to default_vnodeops.  When FREEBSD_NAMECACHE is
  * defined, lookups go through vfs_cache_lookup() and zfs_freebsd_lookup()
  * serves as the cached-lookup backend; otherwise it is called directly.
  * Note that vop_mknod shares its handler with vop_create.
  */
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_access =		zfs_freebsd_access,
 #ifdef FREEBSD_NAMECACHE
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_lookup,
 #else
 	.vop_lookup =		zfs_freebsd_lookup,
 #endif
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
 	.vop_mknod =		zfs_freebsd_create,
 	.vop_mkdir =		zfs_freebsd_mkdir,
 	.vop_readdir =		zfs_freebsd_readdir,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_open =		zfs_freebsd_open,
 	.vop_close =		zfs_freebsd_close,
 	.vop_rmdir =		zfs_freebsd_rmdir,
 	.vop_ioctl =		zfs_freebsd_ioctl,
 	.vop_link =		zfs_freebsd_link,
 	.vop_symlink =		zfs_freebsd_symlink,
 	.vop_readlink =		zfs_freebsd_readlink,
 	.vop_read =		zfs_freebsd_read,
 	.vop_write =		zfs_freebsd_write,
 	.vop_remove =		zfs_freebsd_remove,
 	.vop_rename =		zfs_freebsd_rename,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 	.vop_bmap =		zfs_freebsd_bmap,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getextattr =	zfs_getextattr,
 	.vop_deleteextattr =	zfs_deleteextattr,
 	.vop_setextattr =	zfs_setextattr,
 	.vop_listextattr =	zfs_listextattr,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
 	.vop_putpages =		zfs_freebsd_putpages,
 };
 
 /*
  * Vnode operations vector for FIFOs stored on ZFS.  Unlisted operations
  * fall through to fifo_specops.  vop_read and vop_write are wired to
  * VOP_PANIC, and pathconf uses the FIFO-specific wrapper.
  */
 struct vop_vector zfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_access =		zfs_freebsd_access,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 };
 
 /*
  * special share hidden files vnode operations template
  *
  * Only access, inactive, reclaim, fid and pathconf are ZFS-specific here;
  * every other operation falls through to default_vnodeops.
  */
 struct vop_vector zfs_shareops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		zfs_freebsd_access,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 };
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 296519)
@@ -1,2198 +1,2199 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
 
 #ifdef _KERNEL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/mntent.h>
 #include <sys/u8_textprep.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/atomic.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/kidmap.h>
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_stat.h>
 #include <sys/refcount.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /*
  * Used by fstat(1).
  *
  * Read-only (CTLFLAG_RD) integer sysctl node "znode" under _debug_sizeof
  * whose value is the compile-time sizeof(znode_t).
  */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
 
 /*
  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
  * turned on when DEBUG is also defined.
  */
 #ifdef	DEBUG
 #define	ZNODE_STATS
 #endif	/* DEBUG */
 
 /*
  * ZNODE_STAT_ADD(stat) increments the given counter when ZNODE_STATS is
  * defined and expands to nothing otherwise.
  */
 #ifdef	ZNODE_STATS
 #define	ZNODE_STAT_ADD(stat)			((stat)++)
 #else
 #define	ZNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* ZNODE_STATS */
 
 /*
  * Functions needed for userland (ie: libzpool) are not put under
  * #ifdef _KERNEL; the rest of the functions have dependencies
  * (such as VFS logic) that will not compile easily in userland.
  */
 #ifdef _KERNEL
 /*
  * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
  * be freed before it can be safely accessed.
  */
 krwlock_t zfsvfs_lock;
 
 /*
  * kmem cache that znode_t objects are allocated from.  Created in
  * zfs_znode_init() and destroyed in zfs_znode_fini(); NULL in between.
  */
 static kmem_cache_t *znode_cache = NULL;
 
 /*
  * Eviction callback that must never run: it panics, reporting user_ptr
  * as the znode being evicted.  dbuf is unused.
  */
 /*ARGSUSED*/
 static void
 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
 {
 	/*
 	 * The eviction callback is expected to be cleared before the last
 	 * dbuf reference is dropped, so getting here is a bug.
 	 */
 	panic("evicting znode %p\n", user_ptr);
 }
 
 extern struct vop_vector zfs_vnodeops;
 extern struct vop_vector zfs_fifoops;
 extern struct vop_vector zfs_shareops;
 
 /*
  * kmem cache constructor for znode_t.  Marks the zfsvfs back pointer as
  * invalid, clears the pointer/flag fields, and initializes the list link,
  * the locks and the range-lock AVL tree.  arg and kmflags are unused.
  * Always returns 0.
  */
 /*ARGSUSED*/
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 {
 	znode_t *znode = buf;
 
 	/* Plain fields first. */
 	POINTER_INVALIDATE(&znode->z_zfsvfs);
 	znode->z_dirlocks = NULL;
 	znode->z_acl_cached = NULL;
 	znode->z_vnode = NULL;
 	znode->z_moved = 0;
 
 	list_link_init(&znode->z_link_node);
 
 	/* Per-znode locks. */
 	mutex_init(&znode->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&znode->z_parent_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&znode->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&znode->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Range-lock state: the mutex plus the tree of rl_t nodes. */
 	mutex_init(&znode->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&znode->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
 	return (0);
 }
 
 /*
  * kmem cache destructor for znode_t: the inverse of the constructor.
  * Asserts that the znode is fully detached (invalid zfsvfs pointer, no
  * vnode, not on a list, no dirlocks, no cached ACL) and destroys the
  * locks and the range-lock AVL tree.  arg is unused.
  */
 /*ARGSUSED*/
 static void
 zfs_znode_cache_destructor(void *buf, void *arg)
 {
 	znode_t *zp = buf;
 
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	ASSERT(ZTOV(zp) == NULL);
 	/* NOTE(review): ZTOV(zp) was just asserted to be NULL. */
 	vn_free(ZTOV(zp));
 	ASSERT(!list_link_active(&zp->z_link_node));
 	mutex_destroy(&zp->z_lock);
 	rw_destroy(&zp->z_parent_lock);
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	avl_destroy(&zp->z_range_avl);
 	mutex_destroy(&zp->z_range_lock);
 
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 }
 
 #ifdef	ZNODE_STATS
 /*
  * Counters for the reasons zfs_znode_move() declined to move a znode.
  * Each is bumped through ZNODE_STAT_ADD() on the matching early return.
  */
 static struct {
 	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs pointer not valid */
 	uint64_t zms_zfsvfs_recheck1;	/* changed before zfsvfs_lock held */
 	uint64_t zms_zfsvfs_unmounted;	/* z_unmounted was set */
 	uint64_t zms_zfsvfs_recheck2;	/* changed before z_znodes_lock held */
 	uint64_t zms_obj_held;		/* object hold tryenter failed */
 	uint64_t zms_vnode_locked;	/* v_lock tryenter failed */
 	uint64_t zms_not_only_dnlc;	/* referenced by more than the DNLC */
 } znode_move_stats;
 #endif	/* ZNODE_STATS */
 
 #ifdef illumos
 /*
  * Transfer the identity of znode ozp to the freshly allocated nzp.
  *
  * The vnodes of the two znodes are swapped and their v_data back pointers
  * fixed up, the remaining state is copied field by field, the SA handle's
  * user pointer is redirected to nzp, and ozp is left invalidated (NULL SA
  * handle, invalid z_zfsvfs).  The caller, zfs_znode_move(), holds the
  * locks needed to do this safely.
  */
 static void
 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
 {
 	vnode_t *vp;
 
 	/* Copy fields. */
 	nzp->z_zfsvfs = ozp->z_zfsvfs;
 
 	/* Swap vnodes. */
 	vp = nzp->z_vnode;
 	nzp->z_vnode = ozp->z_vnode;
 	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
 	ZTOV(ozp)->v_data = ozp;
 	ZTOV(nzp)->v_data = nzp;
 
 	nzp->z_id = ozp->z_id;
 	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
 	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
 	nzp->z_unlinked = ozp->z_unlinked;
 	nzp->z_atime_dirty = ozp->z_atime_dirty;
 	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
 	nzp->z_blksz = ozp->z_blksz;
 	nzp->z_seq = ozp->z_seq;
 	nzp->z_mapcnt = ozp->z_mapcnt;
 	nzp->z_gen = ozp->z_gen;
 	nzp->z_sync_cnt = ozp->z_sync_cnt;
 	nzp->z_is_sa = ozp->z_is_sa;
 	nzp->z_sa_hdl = ozp->z_sa_hdl;
 	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
 	nzp->z_links = ozp->z_links;
 	nzp->z_size = ozp->z_size;
 	nzp->z_pflags = ozp->z_pflags;
 	nzp->z_uid = ozp->z_uid;
 	nzp->z_gid = ozp->z_gid;
 	nzp->z_mode = ozp->z_mode;
 
 	/*
 	 * Since this is just an idle znode and kmem is already dealing with
 	 * memory pressure, release any cached ACL.
 	 */
 	if (ozp->z_acl_cached) {
 		zfs_acl_free(ozp->z_acl_cached);
 		ozp->z_acl_cached = NULL;
 	}
 
 	/* The SA handle now belongs to the new znode. */
 	sa_set_userp(nzp->z_sa_hdl, nzp);
 
 	/*
 	 * Invalidate the original znode by clearing fields that provide a
 	 * pointer back to the znode. Set the low bit of the vfs pointer to
 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
 	 * subsequent callback.
 	 */
 	ozp->z_sa_hdl = NULL;
 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
 
 	/*
 	 * Mark the znode.
 	 */
 	nzp->z_moved = 1;
 	ozp->z_moved = (uint8_t)-1;
 }
 
 /*
  * kmem move callback for the znode cache: try to relocate the znode at
  * buf into newbuf.  size and arg are unused.
  *
  * Returns:
  *   KMEM_CBRC_DONT_KNOW  the znode's zfsvfs pointer is invalid or changed
  *                        underneath us, or the file system is unmounted;
  *   KMEM_CBRC_LATER      a required lock could not be taken without
  *                        blocking, or the vnode is referenced by anything
  *                        other than the DNLC;
  *   KMEM_CBRC_YES        the znode was moved.
  *
  * Locks are taken in the order zfsvfs_lock, z_teardown_lock,
  * z_znodes_lock, object hold, v_lock; every early return releases what
  * was acquired up to that point.
  */
 /*ARGSUSED*/
 static kmem_cbrc_t
 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
 {
 	znode_t *ozp = buf, *nzp = newbuf;
 	zfsvfs_t *zfsvfs;
 	vnode_t *vp;
 
 	/*
 	 * The znode is on the file system's list of known znodes if the vfs
 	 * pointer is valid. We set the low bit of the vfs pointer when freeing
 	 * the znode to invalidate it, and the memory patterns written by kmem
 	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
 	 * created znode sets the vfs pointer last of all to indicate that the
 	 * znode is known and in a valid state to be moved by this function.
 	 */
 	zfsvfs = ozp->z_zfsvfs;
 	if (!POINTER_IS_VALID(zfsvfs)) {
 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * Close a small window in which it's possible that the filesystem could
 	 * be unmounted and freed, and zfsvfs, though valid in the previous
 	 * statement, could point to unrelated memory by the time we try to
 	 * prevent the filesystem from being unmounted.
 	 */
 	rw_enter(&zfsvfs_lock, RW_WRITER);
 	if (zfsvfs != ozp->z_zfsvfs) {
 		rw_exit(&zfsvfs_lock);
 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * If the znode is still valid, then so is the file system. We know that
 	 * no valid file system can be freed while we hold zfsvfs_lock, so we
 	 * can safely ensure that the filesystem is not and will not be
 	 * unmounted. The next statement is equivalent to ZFS_ENTER().
 	 */
 	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
 	if (zfsvfs->z_unmounted) {
 		ZFS_EXIT(zfsvfs);
 		rw_exit(&zfsvfs_lock);
 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 	rw_exit(&zfsvfs_lock);
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	/*
 	 * Recheck the vfs pointer in case the znode was removed just before
 	 * acquiring the lock.
 	 */
 	if (zfsvfs != ozp->z_zfsvfs) {
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * At this point we know that as long as we hold z_znodes_lock, the
 	 * znode cannot be freed and fields within the znode can be safely
 	 * accessed. Now, prevent a race with zfs_zget().
 	 */
 	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
 		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/* Never block on the vnode lock; ask kmem to retry later instead. */
 	vp = ZTOV(ozp);
 	if (mutex_tryenter(&vp->v_lock) == 0) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
 		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/* Only move znodes that are referenced _only_ by the DNLC. */
 	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
 		mutex_exit(&vp->v_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
 		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * The znode is known and in a valid state to move. We're holding the
 	 * locks needed to execute the critical section.
 	 */
 	zfs_znode_move_impl(ozp, nzp);
 	mutex_exit(&vp->v_lock);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
 
 	/* Put the new znode where the old one sat on the znode list. */
 	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 	ZFS_EXIT(zfsvfs);
 
 	return (KMEM_CBRC_YES);
 }
 #endif /* illumos */
 
 /*
  * Module-wide znode setup: initializes zfsvfs_lock, creates the
  * "zfs_znode_cache" kmem cache with its constructor and destructor, and
  * registers zfs_znode_move as the cache's move callback.
  */
 void
 zfs_znode_init(void)
 {
 
 	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
 
 	/* The cache must not already exist. */
 	ASSERT(znode_cache == NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t),
 	    0,
 	    zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor,
 	    NULL, NULL, NULL, 0);
 	kmem_cache_set_move(znode_cache, zfs_znode_move);
 }
 
 /*
  * Module-wide znode teardown: on illumos removes the vfs and vnode op
  * tables, then destroys the znode cache (if one exists) and zfsvfs_lock.
  */
 void
 zfs_znode_fini(void)
 {
 #ifdef illumos
 	/* Cleanup vfs & vnode ops */
 	zfs_remove_op_tables();
 #endif
 
 	/* Cleanup zcache */
 	if (znode_cache != NULL) {
 		kmem_cache_destroy(znode_cache);
 		znode_cache = NULL;
 	}
 
 	rw_destroy(&zfsvfs_lock);
 }
 
 #ifdef illumos
 /* Vnode operation tables built by zfs_create_op_tables(). */
 struct vnodeops *zfs_dvnodeops;
 struct vnodeops *zfs_fvnodeops;
 struct vnodeops *zfs_symvnodeops;
 struct vnodeops *zfs_xdvnodeops;
 struct vnodeops *zfs_evnodeops;
 struct vnodeops *zfs_sharevnodeops;
 
 /*
  * Unregister the ZFS vfs ops and free every vnode operation table that
  * has been created, leaving all table pointers NULL and zfsfstype 0.
  */
 void
 zfs_remove_op_tables(void)
 {
 	struct vnodeops **tables[] = {
 		&zfs_dvnodeops,
 		&zfs_fvnodeops,
 		&zfs_symvnodeops,
 		&zfs_xdvnodeops,
 		&zfs_evnodeops,
 		&zfs_sharevnodeops,
 	};
 	const size_t ntables = sizeof (tables) / sizeof (tables[0]);
 	size_t i;
 
 	/* Remove vfs ops */
 	ASSERT(zfsfstype);
 	(void) vfs_freevfsops_by_type(zfsfstype);
 	zfsfstype = 0;
 
 	/* Remove vnode ops: free whatever exists ... */
 	for (i = 0; i < ntables; i++) {
 		if (*tables[i])
 			vn_freevnodeops(*tables[i]);
 	}
 
 	/* ... then forget all of the pointers. */
 	for (i = 0; i < ntables; i++)
 		*tables[i] = NULL;
 }
 
 extern const fs_operation_def_t zfs_dvnodeops_template[];
 extern const fs_operation_def_t zfs_fvnodeops_template[];
 extern const fs_operation_def_t zfs_xdvnodeops_template[];
 extern const fs_operation_def_t zfs_symvnodeops_template[];
 extern const fs_operation_def_t zfs_evnodeops_template[];
 extern const fs_operation_def_t zfs_sharevnodeops_template[];
 
 int
 zfs_create_op_tables()
 {
 	int error;
 
 	/*
 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
 	 * In this case we just return as the ops vectors are already set up.
 	 */
 	if (zfs_dvnodeops)
 		return (0);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
 	    &zfs_dvnodeops);
 	if (error)
 		return (error);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
 	    &zfs_fvnodeops);
 	if (error)
 		return (error);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
 	    &zfs_symvnodeops);
 	if (error)
 		return (error);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
 	    &zfs_xdvnodeops);
 	if (error)
 		return (error);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
 	    &zfs_evnodeops);
 	if (error)
 		return (error);
 
 	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
 	    &zfs_sharevnodeops);
 
 	return (error);
 }
 #endif	/* illumos */
 
 /*
  * Create the shares directory object for a file system within
  * transaction tx.
  *
  * A temporary znode is taken from the znode cache and used to create a
  * directory object (mode 0555, owned by kcred's uid/gid) via
  * zfs_mknode().  The new object's id is recorded under ZFS_SHARES_DIR in
  * the master node ZAP and in zfsvfs->z_shares_dir, after which the
  * temporary znode and its SA handle are released.
  *
  * Returns the result of zap_add().  Note that z_shares_dir is assigned
  * regardless of that result.
  */
 int
 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
 	zfs_acl_ids_t acl_ids;
 	vattr_t vattr;
 	znode_t *sharezp;
 	znode_t *zp;
 	int error;
 
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0555;
 	vattr.va_uid = crgetuid(kcred);
 	vattr.va_gid = crgetgid(kcred);
 
 	/* Scratch znode; only the fields set here are initialized by hand. */
 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
 	sharezp->z_moved = 0;
 	sharezp->z_unlinked = 0;
 	sharezp->z_atime_dirty = 0;
 	sharezp->z_zfsvfs = zfsvfs;
 	sharezp->z_is_sa = zfsvfs->z_use_sa;
 
 	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
 	    kcred, NULL, &acl_ids));
 	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, sharezp);
 	/* Re-invalidate the back pointer before the znode is freed below. */
 	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
 	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
 	zfsvfs->z_shares_dir = sharezp->z_id;
 
 	zfs_acl_ids_free(&acl_ids);
 	sa_handle_destroy(sharezp->z_sa_hdl);
 	kmem_cache_free(znode_cache, sharezp);
 
 	return (error);
 }
 
 /*
  * define a couple of values we need available
  * for both 64 and 32 bit environments.
  */
 #ifndef NBITSMINOR64
 #define	NBITSMINOR64	32
 #endif
 #ifndef MAXMAJ64
 #define	MAXMAJ64	0xffffffffUL
 #endif
 #ifndef	MAXMIN64
 #define	MAXMIN64	0xffffffffUL
 #endif
 
 /*
  * Create special expldev for ZFS private use.
  * Can't use standard expldev since it doesn't do
  * what we want.  The standard expldev() takes a
  * dev32_t in LP64 and expands it to a long dev_t.
  * We need an interface that takes a dev32_t in ILP32
  * and expands it to a long dev_t.
  */
 static uint64_t
 zfs_expldev(dev_t dev)
 {
 	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
 }
 /*
  * Special cmpldev for ZFS private use.
  * Can't use standard cmpldev since it takes
  * a long dev_t and compresses it to dev32_t in
  * LP64.  We need to do a compaction of a long dev_t
  * to a dev32_t in ILP32.
  */
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
 	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
 }
 
 /*
  * Attach a system-attribute handle to znode zp.
  *
  * If sa_hdl is NULL a shared handle is obtained from db; otherwise the
  * supplied handle is adopted and its user pointer set to zp.  z_is_sa is
  * derived from obj_type, and the vnode is flagged VROOT when zp is the
  * file system's root object.  The caller must hold the object mutex for
  * zp->z_id (asserted below).
  */
 static void
 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 {
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
 	mutex_enter(&zp->z_lock);
 
 	/* The znode must not already have a handle or a cached ACL. */
 	ASSERT(zp->z_sa_hdl == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 	if (sa_hdl == NULL) {
 		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 		    SA_HDL_SHARED, &zp->z_sa_hdl));
 	} else {
 		zp->z_sa_hdl = sa_hdl;
 		sa_set_userp(sa_hdl, zp);
 	}
 
 	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 
 	/*
 	 * Slap on VROOT if we are the root znode
 	 */
 	if (zp->z_id == zfsvfs->z_root)
 		ZTOV(zp)->v_flag |= VROOT;
 
 	mutex_exit(&zp->z_lock);
 	vn_exists(ZTOV(zp));
 }
 
 /*
  * Destroy the znode's system-attribute handle and clear z_sa_hdl.
  *
  * The assertion requires one of: the object mutex for zp->z_id is held,
  * the znode is unlinked, or z_teardown_inactive_lock is write-held.
  */
 void
 zfs_znode_dmu_fini(znode_t *zp)
 {
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
 	    zp->z_unlinked ||
 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
 
 	sa_handle_destroy(zp->z_sa_hdl);
 	zp->z_sa_hdl = NULL;
 }
 
 /*
  * Dispose of a vnode that will not be used after all: detach it from its
  * private data, switch it to dead_vnodeops, then vgone() and vput() it.
  */
 static void
 zfs_vnode_forget(vnode_t *vp)
 {
 
 	/* copied from insmntque_stddtr */
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Construct a new znode/vnode and initialize it.
  *
  * This does not call dmu_set_user(); that is up to the caller to do,
  * in case you don't want to return the znode.
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     dmu_object_type_t obj_type, sa_handle_t *hdl)
 {
 	znode_t	*zp;
 	vnode_t *vp;
 	uint64_t mode;
 	uint64_t parent;
 	sa_bulk_attr_t bulk[9];
 	int count = 0;
 	int error;
 
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 
 	KASSERT(curthread->td_vp_reserv > 0,
 	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
 	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
 	if (error != 0) {
 		kmem_cache_free(znode_cache, zp);
 		return (NULL);
 	}
 	zp->z_vnode = vp;
 	vp->v_data = zp;
 
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	zp->z_moved = 0;
 
 	/*
 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
 	 * the zfs_znode_move() callback.
 	 */
 	zp->z_sa_hdl = NULL;
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;
 	zp->z_mapcnt = 0;
 	zp->z_id = db->db_object;
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
 
 	vp = ZTOV(zp);
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &zp->z_links, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &zp->z_atime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 	    &zp->z_uid, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 	    &zp->z_gid, 8);
 
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
 		if (hdl == NULL)
 			sa_handle_destroy(zp->z_sa_hdl);
 		zfs_vnode_forget(vp);
 		zp->z_vnode = NULL;
 		kmem_cache_free(znode_cache, zp);
 		return (NULL);
 	}
 
 	zp->z_mode = mode;
 
 	vp->v_type = IFTOVT((mode_t)mode);
 
 	switch (vp->v_type) {
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
 		break;
 #ifdef illumos
 	case VBLK:
 	case VCHR:
 		{
 			uint64_t rdev;
 			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
 			    &rdev, sizeof (rdev)) == 0);
 
 			vp->v_rdev = zfs_cmpldev(rdev);
 		}
 		break;
 #endif
 	case VFIFO:
 #ifdef illumos
 	case VSOCK:
 	case VDOOR:
 #endif
 		vp->v_op = &zfs_fifoops;
 		break;
 	case VREG:
 		if (parent == zfsvfs->z_shares_dir) {
 			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
 			vp->v_op = &zfs_shareops;
 		}
 		break;
 #ifdef illumos
 	case VLNK:
 		vn_setops(vp, zfs_symvnodeops);
 		break;
 	default:
 		vn_setops(vp, zfs_evnodeops);
 		break;
 #endif
 	}
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	membar_producer();
 	/*
 	 * Everything else must be valid before assigning z_zfsvfs makes the
 	 * znode eligible for zfs_znode_move().
 	 */
 	zp->z_zfsvfs = zfsvfs;
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * Acquire vnode lock before making it available to the world.
 	 */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VN_LOCK_AREC(vp);
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);
 
 	VFS_HOLD(zfsvfs->z_vfs);
 	return (zp);
 }
 
 /*
  * File-scope placeholder values that zfs_mknode() passes to the SA layer
  * for SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL when it builds a
  * DMU_OT_ZNODE (old znode_phys_t format) object.  As statics without an
  * initializer they start out zero-filled.
  */
 static uint64_t empty_xattr;
 static uint64_t pad[4];
 static zfs_acl_phys_t acl_phys;
 /*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode
  *		vap	- file attributes for new znode
  *		tx	- dmu transaction id for zap operations
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_XATTR	- new object is an attribute
  *		acl_ids	- mode, ACL and owner/group ids for the new object
  *			  (z_mode, z_aclp, z_fuid and z_fgid are read here)
  *
  *	OUT:	zpp	- allocated znode
  *
  * For IS_ROOT_NODE no new znode is allocated; *zpp is set to dzp, which
  * receives the new object's id and SA handle.
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 {
 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
 	uint64_t	mode, size, links, parent, pflags;
 	uint64_t	dzp_pflags = 0;
 	uint64_t	rdev = 0;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	dmu_buf_t	*db;
 	timestruc_t	now;
 	uint64_t	gen, obj;
 	int		err;
 	int		bonuslen;
 	sa_handle_t	*sa_hdl;
 	dmu_object_type_t obj_type;
 	sa_bulk_attr_t	sa_attrs[ZPL_END];
 	int		cnt = 0;
 	zfs_acl_locator_cb_t locate = { 0 };
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
 	/*
 	 * During replay the object id, timestamp and generation are taken
 	 * from fields of vap instead of being generated here.
 	 */
 	if (zfsvfs->z_replay) {
 		obj = vap->va_nodeid;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
 		obj = 0;
 		vfs_timestamp(&now);
 		gen = dmu_tx_get_txg(tx);
 	}
 
 	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
 	bonuslen = (obj_type == DMU_OT_SA) ?
 	    DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
 
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
 	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
 		if (zfsvfs->z_replay) {
 			VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, tx));
 		} else {
 			obj = zap_create_norm(zfsvfs->z_os,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, tx);
 		}
 	} else {
 		if (zfsvfs->z_replay) {
 			VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, tx));
 		} else {
 			obj = dmu_object_alloc(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, tx);
 		}
 	}
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
 		dzp->z_id = obj;
 	} else {
 		dzp_pflags = dzp->z_pflags;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
 	if (dzp_pflags & ZFS_XATTR) {
 		flag |= IS_XATTR;
 	}
 
 	if (zfsvfs->z_use_fuids)
 		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 	else
 		pflags = 0;
 
 	if (vap->va_type == VDIR) {
 		size = 2;		/* contents ("." and "..") */
 		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
 	} else {
 		size = links = 0;
 	}
 
 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
 		rdev = zfs_expldev(vap->va_rdev);
 	}
 
 	parent = dzp->z_id;
 	mode = acl_ids->z_mode;
 	if (flag & IS_XATTR)
 		pflags |= ZFS_XATTR;
 
 	/*
 	 * No execs denied will be determined when zfs_mode_compute() is
 	 * called.
 	 */
 	pflags |= acl_ids->z_aclp->z_hints &
 	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
 	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 
 	ZFS_TIME_ENCODE(&now, crtime);
 	ZFS_TIME_ENCODE(&now, ctime);
 
 	/* Caller-supplied atime/mtime win over the current time. */
 	if (vap->va_mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, atime);
 	} else {
 		ZFS_TIME_ENCODE(&now, atime);
 	}
 
 	if (vap->va_mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 	} else {
 		ZFS_TIME_ENCODE(&now, mtime);
 	}
 
 	/* Now add in all of the "SA" attributes */
 	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 	    &sa_hdl));
 
 	/*
 	 * Setup the array of attributes to be replaced/set on the new file
 	 *
 	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
 	 * in the old znode_phys_t format.  Don't change this ordering
 	 */
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 	} else {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 		    &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 		    &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 	}
 
 	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
 		    &empty_xattr, 8);
 	}
 	if (obj_type == DMU_OT_ZNODE ||
 	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
 		    NULL, &rdev, 8);
 
 	}
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 		    &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 		    &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
 		    sizeof (uint64_t) * 4);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
 		    &acl_phys, sizeof (zfs_acl_phys_t));
 	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
 		/* SA object with a FUID-style ACL: store the ACL inline. */
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
 		    &acl_ids->z_aclp->z_acl_count, 8);
 		locate.cb_aclp = acl_ids->z_aclp;
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
 		    zfs_acl_data_locator, &locate,
 		    acl_ids->z_aclp->z_acl_bytes);
 		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
 		    acl_ids->z_fuid, acl_ids->z_fgid);
 	}
 
 	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
 
 	if (!(flag & IS_ROOT_NODE)) {
 		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 		ASSERT(*zpp != NULL);
 	} else {
 		/*
 		 * If we are creating the root node, the "parent" we
 		 * passed in is the znode for the root.
 		 */
 		*zpp = dzp;
 
 		(*zpp)->z_sa_hdl = sa_hdl;
 	}
 
 	(*zpp)->z_pflags = pflags;
 	(*zpp)->z_mode = mode;
 
 	if (vap->va_mask & AT_XVATTR)
 		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
 
 	/* Old-format znodes and pre-FUID ACLs are written separately. */
 	if (obj_type == DMU_OT_ZNODE ||
 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	if (!(flag & IS_ROOT_NODE)) {
 		vnode_t *vp;
 
 		/* Put the new vnode on the mount's vnode list. */
 		vp = ZTOV(*zpp);
 		vp->v_vflag |= VV_FORCEINSMQ;
 		err = insmntque(vp, zfsvfs->z_vfs);
 		vp->v_vflag &= ~VV_FORCEINSMQ;
 		KASSERT(err == 0, ("insmntque() failed: error %d", err));
 	}
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 }
 
 /*
  * Update in-core attributes.  It is assumed the caller will be doing an
  * sa_bulk_update to push the changes out.
  *
  * For every optional attribute requested in xvap the matching bit in
  * zp->z_pflags is set or cleared via ZFS_ATTR_SET() and the attribute is
  * marked as returned with XVA_SET_RTN().  Two attributes are handled
  * differently: XAT_CREATETIME is written immediately with sa_update()
  * (its return value is ignored) and XAT_AV_SCANSTAMP is passed to
  * zfs_sa_set_scanstamp().
  */
 void
 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 {
 	xoptattr_t *xoap;
 
 	xoap = xva_getxoptattr(xvap);
 	ASSERT(xoap);
 
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 		uint64_t times[2];
 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 		    &times, sizeof (times), tx);
 		XVA_SET_RTN(xvap, XAT_CREATETIME);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_READONLY);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_HIDDEN);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_SYSTEM);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NODUMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_OPAQUE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 		zfs_sa_set_scanstamp(zp, xvap, tx);
 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_REPARSE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_OFFLINE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_SPARSE);
 	}
 }
 
 /*
  * Return, in *zpp, the znode for object obj_num, creating the in-core
  * znode/vnode pair if none is attached to the object's bonus buffer yet.
  *
  * One vnode is reserved with getnewvnode_reserve() up front and the
  * reservation is dropped on every return path.
  *
  *	RETURN:	0 on success (an existing vnode is returned with an extra
  *		hold), the sa_buf_hold() error, EINVAL if the bonus buffer
  *		is not a valid znode, ENOENT if the znode is unlinked or
  *		zfs_znode_alloc() fails, or the insmntque() error.
  */
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
 	vnode_t		*vp;
 	sa_handle_t	*hdl;
 	struct thread	*td;
 	int locked;
 	int err;
 
 	td = curthread;
 	getnewvnode_reserve(1);
 again:
 	*zpp = NULL;
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		getnewvnode_drop_reserve();
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot hold a znode. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 #ifdef __FreeBSD__
 		getnewvnode_drop_reserve();
 #endif
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* An SA handle already attached to the buffer means a live znode. */
 	hdl = dmu_buf_get_user(db);
 	if (hdl != NULL) {
 		zp  = sa_get_userdata(hdl);
 
 
 		/*
 		 * Since "SA" does immediate eviction we
 		 * should never find a sa handle that doesn't
 		 * know about the znode.
 		 */
 
 		ASSERT3P(zp, !=, NULL);
 
 		mutex_enter(&zp->z_lock);
 		ASSERT3U(zp->z_id, ==, obj_num);
 		if (zp->z_unlinked) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			vp = ZTOV(zp);
 			*zpp = zp;
 			err = 0;
 		}
 
 		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
 		if (err == 0)
 			VN_HOLD(vp);
 
 		mutex_exit(&zp->z_lock);
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
 		if (err == 0) {
 			locked = VOP_ISLOCKED(vp);
 			VI_LOCK(vp);
 			if ((vp->v_iflag & VI_DOOMED) != 0 &&
 			    locked != LK_EXCLUSIVE) {
 				/*
 				 * The vnode is doomed and this thread doesn't
 				 * hold the exclusive lock on it, so the vnode
 				 * must be being reclaimed by another thread.
 				 * Otherwise the doomed vnode is being reclaimed
 				 * by this thread and zfs_zget is called from
 				 * ZIL internals.
 				 */
 				VI_UNLOCK(vp);
 				VN_RELE(vp);
 				goto again;
 			}
 			VI_UNLOCK(vp);
 		}
 		getnewvnode_drop_reserve();
 		return (err);
 	}
 
 	/*
 	 * Not found create new znode/vnode
 	 * but only if file exists.
 	 *
 	 * There is a small window where zfs_vget() could
 	 * find this object while a file create is still in
 	 * progress.  This is checked for in zfs_znode_alloc()
 	 *
 	 * if zfs_znode_alloc() fails it will drop the hold on the
 	 * bonus buffer.
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
 	    doi.doi_bonus_type, NULL);
 	if (zp == NULL) {
 		err = SET_ERROR(ENOENT);
 	} else {
 		*zpp = zp;
 	}
 	/* err is still 0 from sa_buf_hold() unless the allocation failed. */
 	if (err == 0) {
 		vnode_t *vp = ZTOV(zp);
 
 		err = insmntque(vp, zfsvfs->z_vfs);
 		if (err == 0) {
 			vp->v_hash = obj_num;
 			VOP_UNLOCK(vp, 0);
 		} else {
 			/* Tear the znode down again and report the error. */
 			zp->z_vnode = NULL;
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_free(zp);
 			*zpp = NULL;
 		}
 	}
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 	getnewvnode_drop_reserve();
 	return (err);
 }
 
 /*
  * Re-attach an in-core znode (whose SA handle has been torn down) to its
  * on-disk object and reload the attributes cached in the znode.
  *
  *	IN:	zp	- znode to refresh; zp->z_sa_hdl must be NULL.
  *
  *	RETURN:	0 on success, the sa_buf_hold() error, EINVAL if the
  *		bonus buffer is not a valid znode, or EIO if the attribute
  *		lookup fails, the generation number changed, or the object's
  *		file type no longer matches the vnode's type.
  */
 int
 zfs_rezget(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	vnode_t *vp;
 	uint64_t obj_num = zp->z_id;
 	uint64_t mode, size;
 	sa_bulk_attr_t bulk[8];
 	int err;
 	int count = 0;
 	uint64_t gen;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	/* Any cached ACL may be stale; drop it. */
 	mutex_enter(&zp->z_acl_lock);
 	if (zp->z_acl_cached) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 
 	mutex_exit(&zp->z_acl_lock);
 	ASSERT(zp->z_sa_hdl == NULL);
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot hold a znode. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EINVAL));
 	}
 
 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
 	/* Remember the old size so a change can be detected below. */
 	size = zp->z_size;
 
 	/* reload cached values */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
 	    &gen, sizeof (gen));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, sizeof (zp->z_size));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &zp->z_links, sizeof (zp->z_links));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &zp->z_atime, sizeof (zp->z_atime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 	    &zp->z_uid, sizeof (zp->z_uid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 	    &zp->z_gid, sizeof (zp->z_gid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 	    &mode, sizeof (mode));
 
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EIO));
 	}
 
 	zp->z_mode = mode;
 
 	/* A different generation means this is no longer the same file. */
 	if (gen != zp->z_gen) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * It is highly improbable but still quite possible that two
 	 * objects in different datasets are created with the same
 	 * object numbers and in transaction groups with the same
 	 * numbers.  znodes corresponding to those objects would
 	 * have the same z_id and z_gen, but their other attributes
 	 * may be different.
 	 * zfs recv -F may replace one of such objects with the other.
 	 * As a result file properties recorded in the replaced
 	 * object's vnode may no longer match the received object's
 	 * properties.  At present the only cached property is the
 	 * files type recorded in v_type.
 	 * So, handle this case by leaving the old vnode and znode
 	 * disassociated from the actual object.  A new vnode and a
 	 * znode will be created if the object is accessed
 	 * (e.g. via a look-up).  The old vnode and znode will be
 	 * recycled when the last vnode reference is dropped.
 	 */
 	vp = ZTOV(zp);
 	if (vp != NULL && vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		/* Use SET_ERROR() like every other failure path here. */
 		return (SET_ERROR(EIO));
 	}
 
 	zp->z_unlinked = (zp->z_links == 0);
 	zp->z_blksz = doi.doi_data_block_size;
 	if (vp != NULL) {
 		/* Drop cached pages and resize the pager if the size moved. */
 		vn_pages_remove(vp, 0, 0);
 		if (zp->z_size != size)
 			vnode_pager_setsize(vp, zp->z_size);
 	}
 
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
 	return (0);
 }
 
 /*
  * Free the DMU object behind a znode in transaction tx, together with its
  * external ACL object if zfs_external_acl() reports one, then detach and
  * free the in-core znode.
  */
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs;
 	objset_t *os;
 	uint64_t obj, acl_obj;
 
 	zfsvfs = zp->z_zfsvfs;
 	os = zfsvfs->z_os;
 	obj = zp->z_id;
 	acl_obj = zfs_external_acl(zp);
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 
 	if (acl_obj != 0) {
 		/* An external ACL object must not exist on an SA znode. */
 		VERIFY(!zp->z_is_sa);
 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
 	}
 	VERIFY(0 == dmu_object_free(os, obj, tx));
 
 	/* Drop the SA handle while the object hold is still in place. */
 	zfs_znode_dmu_fini(zp);
 
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 	zfs_znode_free(zp);
 }
 
 /*
  * Release an inactive znode.  An unlinked znode is handed to zfs_rmnode();
  * any other znode is detached from its SA handle and freed.
  */
 void
 zfs_zinactive(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;
 	boolean_t unlinked;
 
 	ASSERT(zp->z_sa_hdl);
 
 	/*
 	 * Keep zfs_zget() away from this object while we are releasing
 	 * the znode.
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
 	/* Sample the unlinked state under the znode lock. */
 	mutex_enter(&zp->z_lock);
 	unlinked = (zp->z_unlinked != 0);
 	mutex_exit(&zp->z_lock);
 
 	if (unlinked) {
 		/*
 		 * This was the last reference to a file with no links:
 		 * remove the file from the file system.
 		 */
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		zfs_rmnode(zp);
 		return;
 	}
 
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);
 }
 
 /*
  * Free an in-core znode whose SA handle has already been torn down:
  * unlink it from the file system's znode list, drop any cached ACL,
  * return the znode to znode_cache and release the hold on the VFS.
  */
 void
 zfs_znode_free(znode_t *zp)
 {
 	/* Saved up front: zp->z_zfsvfs is invalidated below. */
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ASSERT(zp->z_sa_hdl == NULL);
 
 	zp->z_vnode = NULL;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
 	list_remove(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	if (zp->z_acl_cached != NULL) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 
 	kmem_cache_free(znode_cache, zp);
 
 	/* zp is gone; only the saved zfsvfs pointer may be used now. */
 	VFS_RELE(zfsvfs->z_vfs);
 }
 
 /*
  * Prepare timestamp updates for a znode using the current time.
  *
  *	IN:	zp	- znode being updated; z_atime, z_atime_dirty, z_seq
  *			  and z_pflags may be modified.
  *		flag	- AT_ATIME / AT_MTIME / AT_CTIME bits selecting
  *			  which timestamps to refresh.
  *		have_tx	- B_TRUE if the caller has an open transaction.
  *
  *	OUT:	mtime	- encoded modification time (if AT_MTIME is set)
  *		ctime	- encoded change time (if AT_CTIME is set)
  */
 void
 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
     uint64_t ctime[2], boolean_t have_tx)
 {
 	timestruc_t	now;
 
 	vfs_timestamp(&now);
 
 	/* will sa_bulk_update happen really soon? */
 	if (!have_tx) {
 		zp->z_atime_dirty = 1;
 	} else {
 		zp->z_atime_dirty = 0;
 		zp->z_seq++;
 	}
 
 	if ((flag & AT_ATIME) != 0) {
 		ZFS_TIME_ENCODE(&now, zp->z_atime);
 	}
 
 	if ((flag & AT_MTIME) != 0) {
 		ZFS_TIME_ENCODE(&now, mtime);
 		if (zp->z_zfsvfs->z_use_fuids)
 			zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
 	}
 
 	if ((flag & AT_CTIME) != 0) {
 		ZFS_TIME_ENCODE(&now, ctime);
 		if (zp->z_zfsvfs->z_use_fuids)
 			zp->z_pflags |= ZFS_ARCHIVE;
 	}
 }
 
 /*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of file whose block size is to grow.
  *		size	- requested block size
  *		tx	- open transaction.
  *
  * Nothing is done when the request does not exceed the current block
  * size, when the file is already larger than its (non-zero) block size,
  * or when the DMU answers ENOTSUP.
  *
  * NOTE: this function assumes that the znode is write locked.
  */
 void
 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
 {
 	u_longlong_t	nblk512;	/* required by the API, unused here */
 	int		error;
 
 	/*
 	 * Never shrink.  A file that already extends past its current
 	 * block consists of more than one block, and then the block size
 	 * can no longer change.
 	 */
 	if (size <= zp->z_blksz ||
 	    (zp->z_blksz && zp->z_size > zp->z_blksz))
 		return;
 
 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
 	    size, 0, tx);
 	if (error == ENOTSUP)
 		return;
 	ASSERT0(error);
 
 	/* Record the block size the DMU actually settled on. */
 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz,
 	    &nblk512);
 }
 
 #ifdef illumos
 /*
  * This is a dummy interface used when pvn_vplist_dirty() should *not*
  * be calling back into the fs for a putpage().  E.g.: when truncating
  * a file, the pages being "thrown away" don't need to be written out.
  *
  * It asserts if it is ever reached and otherwise returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
     int flags, cred_t *cr)
 {
 	ASSERT(0);
 	return (0);
 }
 #endif
 
 /*
  * Increase the file length
  *
  *	IN:	zp	- znode of file to extend.
  *		end	- new end-of-file
  *
  *	RETURN:	0 on success, error code on failure
  *
  * The whole file is range-locked as writer for the duration.  If the new
  * end lies beyond the current block size and the block size may still
  * grow, the block size is grown in the same transaction.
  */
 static int
 zfs_extend(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	dmu_tx_t *tx;
 	rl_t *rl;
 	uint64_t newblksz;
 	int error;
 
 	/*
 	 * We will change z_size, lock the whole file.
 	 */
 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end <= zp->z_size) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	if (end > zp->z_blksz &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
 			/*
 			 * File's blocksize is already larger than the
 			 * "recordsize" property.  Only let it grow to
 			 * the next power of 2.
 			 */
 			ASSERT(!ISP2(zp->z_blksz));
 			/*
 			 * Shift a 64-bit one: a plain int "1" would make
 			 * the shift undefined for counts of 31 or more.
 			 */
 			newblksz = MIN(end, 1ULL << highbit64(zp->z_blksz));
 		} else {
 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
 	} else {
 		/* Zero means "leave the block size alone" below. */
 		newblksz = 0;
 	}
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
 
 	if (newblksz)
 		zfs_grow_blocksize(zp, newblksz, tx);
 
 	zp->z_size = end;
 
 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx));
 
 	/* Tell the VM system about the new file size. */
 	vnode_pager_setsize(ZTOV(zp), end);
 
 	zfs_range_unlock(rl);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Free space in a file.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of section to free.
  *		len	- length of section to free.
  *
  *	RETURN:	0 on success, error code on failure
  *
  * The requested range is locked as writer while it is being freed and is
  * clipped to the current end of file.
  */
 static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	rl_t *rl;
 	int error = 0;
 
 	/* Lock the range being freed. */
 	rl = zfs_range_lock(zp, off, len, RL_WRITER);
 
 	/* There is only work to do if the range starts inside the file. */
 	if (off < zp->z_size) {
 		if (off + len > zp->z_size)
 			len = zp->z_size - off;
 
 		error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
 		if (error == 0) {
 			/*
 			 * In FreeBSD we cannot free block in the middle of a
 			 * file, but only at the end of a file, so this code
 			 * path should never happen.
 			 */
 			vnode_pager_setsize(ZTOV(zp), off);
 		}
 	}
 
 	zfs_range_unlock(rl);
 
 	return (error);
 }
 
 /*
  * Truncate a file
  *
  *	IN:	zp	- znode of file to truncate.
  *		end	- new end-of-file.
  *
  *	RETURN:	0 on success, error code on failure
  *
  * The whole file is range-locked as writer for the duration.  Truncating
  * to zero length also clears the ZFS_SPARSE flag.
  */
 static int
 zfs_trunc(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	sa_bulk_attr_t bulk[2];
 	dmu_tx_t *tx;
 	rl_t *rl;
 	int count = 0;
 	int error = 0;
 
 	/* z_size is about to change, so lock the whole file. */
 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 
 	/* Nothing to do if the file is already no longer than requested. */
 	if (end >= zp->z_size)
 		goto out;
 
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
 	if (error)
 		goto out;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	/* Record the new size, plus the flags when they change. */
 	zp->z_size = end;
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
 	    NULL, &zp->z_size, sizeof (zp->z_size));
 
 	if (end == 0) {
 		zp->z_pflags &= ~ZFS_SPARSE;
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &zp->z_pflags, 8);
 	}
 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
 	vnode_pager_setsize(vp, end);
 
 out:
 	zfs_range_unlock(rl);
 
 	return (error);
 }
 
 /*
  * Free space in a file
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of range
  *		len	- length of range (0 => free from off to EOF)
  *		flag	- current file open mode flags.
  *		log	- TRUE if this action should be logged
  *
  *	RETURN:	0 on success, error code on failure
  *
  * If off lies beyond the current end of file the file is extended to
  * off + len instead.  When log is set, mtime/ctime and the flags are
  * updated and a TX_TRUNCATE record is written in a separate transaction.
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 {
 	vnode_t *vp = ZTOV(zp);
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog = zfsvfs->z_log;
 	uint64_t mode;
 	uint64_t mtime[2], ctime[2];
 	sa_bulk_attr_t bulk[3];
 	int count = 0;
 	int error;
 
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
 	    sizeof (mode))) != 0)
 		return (error);
 
 	/* Starting past EOF: this is an extension, not a free. */
 	if (off > zp->z_size) {
 		error = zfs_extend(zp, off+len);
 		if (error == 0 && log)
 			goto log;
 		else
 			return (error);
 	}
 
 	/*
 	 * Check for any locks in the region to be freed.
 	 */
 
 	if (MANDLOCK(vp, (mode_t)mode)) {
 		uint64_t length = (len ? len : zp->z_size - off);
 		/* Explicit comparison: the assignment here is intended. */
 		if ((error = chklock(vp, FWRITE, off, length, flag,
 		    NULL)) != 0)
 			return (error);
 	}
 
 	if (len == 0) {
 		error = zfs_trunc(zp, off);
 	} else {
 		/* Free the range; re-extend if it reached past EOF. */
 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
 		    off + len > zp->z_size)
 			error = zfs_extend(zp, off+len);
 	}
 	if (error || !log)
 		return (error);
 log:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	/*
 	 * The bulk entries point at mtime/ctime/z_pflags, which are filled
 	 * in by zfs_tstamp_update_setup() before the update is issued.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 	    NULL, &zp->z_pflags, 8);
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 	ASSERT(error == 0);
 
 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Populate a freshly created, empty objset with the objects that make it
  * a ZPL file system: the master node with its properties and version, the
  * SA attribute registration object (for SA-capable versions), the delete
  * queue, the root directory and the shares directory.
  *
  *	IN:	os	 - empty objset to populate
  *		cr	 - credentials; supply uid/gid of the root directory
  *		zplprops - uint64 ZPL properties to store in the master node
  *		tx	 - transaction all objects are created in
  *
  * A temporary, minimal zfsvfs_t and root znode are built so that
  * zfs_mknode() can be used; both are released again before returning.
  */
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
 	int		error;
 	int		i;
 	znode_t		*rootzp = NULL;
 	zfsvfs_t	*zfsvfs;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
 
 	/*
 	 * First attempt to create master node.
 	 */
 	/*
 	 * In an empty objset, there are no blocks to read and thus
 	 * there can be no i/o errors (which we assert below).
 	 */
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Set starting attributes.
 	 */
 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
 		uint64_t val;
 		char *name;
 
 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
 		name = nvpair_name(elem);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
 			/* A requested version may only lower the default. */
 			if (val < version)
 				version = val;
 		} else {
 			error = zap_update(os, moid, name, 8, 1, &val, tx);
 		}
 		ASSERT(error == 0);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
 			norm = val;
 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
 			sense = val;
 	}
 	ASSERT(version != 0);
 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 	/* Checked like every other step; it was silently overwritten. */
 	ASSERT(error == 0);
 
 	/*
 	 * Create zap object used for SA attribute registration
 	 */
 
 	if (version >= ZPL_VERSION_SA) {
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT(error == 0);
 	} else {
 		sa_obj = 0;
 	}
 	/*
 	 * Create a delete queue.
 	 */
 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
 	 */
 	VATTR_NULL(&vattr);
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = crgetuid(cr);
 	vattr.va_gid = crgetgid(cr);
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
 	rootzp->z_moved = 0;
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_is_sa = USE_SA(version, os);
 
 	zfsvfs->z_os = os;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_version = version;
 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
 	zfsvfs->z_use_sa = USE_SA(version, os);
 	zfsvfs->z_norm = norm;
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 
 	ASSERT(error == 0);
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/* Make rootzp look attached just long enough for zfs_mknode(). */
 	rootzp->z_zfsvfs = zfsvfs;
 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids));
 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
 	ASSERT(error == 0);
 	zfs_acl_ids_free(&acl_ids);
 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
 	sa_handle_destroy(rootzp->z_sa_hdl);
 	kmem_cache_free(znode_cache, rootzp);
 
 	/*
 	 * Create shares directory
 	 */
 
 	error = zfs_create_share_dir(zfsvfs, tx);
 
 	ASSERT(error == 0);
 
 	/* Destroy every lock initialized above before freeing the zfsvfs. */
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 #endif /* _KERNEL */
 
 /*
  * Look up the SA attribute registration object (ZFS_SA_ATTRS) in the
  * master node and pass it to sa_setup() to obtain the attribute table
  * for 'osp'.  A missing entry (ENOENT) is tolerated, in which case
  * sa_setup() is called with object number 0.  Any other lookup error is
  * returned unchanged; otherwise the result of sa_setup() is returned.
  */
 static int
 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
 {
 	uint64_t attr_obj = 0;
 	int err;
 
 	err = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &attr_obj);
 	if (err == 0 || err == ENOENT)
 		return (sa_setup(osp, attr_obj, zfs_attr_table, ZPL_END,
 		    sa_table));
 
 	return (err);
 }
 
 /*
  * Hold the bonus buffer of object 'obj' (returned through 'db') and
  * create a private SA handle for it (returned through 'hdlp').
  *
  * The object is rejected with ENOTSUP when its bonus type is neither
  * DMU_OT_SA nor DMU_OT_ZNODE, or when it is a DMU_OT_ZNODE bonus that is
  * smaller than a znode_phys_t.  On any failure after the buffer has been
  * held, the hold is dropped again before returning.
  */
 static int
 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
     dmu_buf_t **db, void *tag)
 {
 	dmu_object_info_t info;
 	int is_sa, is_znode;
 	int err;
 
 	err = sa_buf_hold(osp, obj, tag, db);
 	if (err != 0)
 		return (err);
 
 	dmu_object_info_from_db(*db, &info);
 	is_sa = (info.doi_bonus_type == DMU_OT_SA);
 	is_znode = (info.doi_bonus_type == DMU_OT_ZNODE);
 
 	if ((!is_sa && !is_znode) ||
 	    (is_znode && info.doi_bonus_size < sizeof (znode_phys_t))) {
 		err = SET_ERROR(ENOTSUP);
 	} else {
 		err = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
 	}
 
 	/* Single exit point: drop the buffer hold on every failure. */
 	if (err != 0)
 		sa_buf_rele(*db, tag);
 
 	return (err);
 }
 
 /*
  * Release the pair of resources handed out by zfs_grab_sa_handle():
  * destroy the SA handle 'hdl' first, then drop the hold on buffer 'db'
  * using the caller-supplied 'tag'.
  */
 void
 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
 {
 	sa_handle_destroy(hdl);
 	sa_buf_rele(db, tag);
 }
 
 /*
  * Given the SA handle of an object, return its parent object number and
  * whether or not the object is an extended attribute directory.
  *
  *	osp		objset containing the object
  *	hdl		SA handle of the object being examined
  *	sa_table	attribute table used to translate ZPL_* indices
  *	pobjp		out: parent object number (written only on success)
  *	is_xattrdir	out: non-zero when the object has ZFS_XATTR set in
  *			its flags and its mode is a directory
  *
  * Returns 0 on success, the error from the SA lookups or from grabbing
  * the parent's handle, or EINVAL when the object is not an xattr
  * directory and the recorded parent is not a directory.
  */
 static int
 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
     uint64_t *pobjp, int *is_xattrdir)
 {
 	uint64_t parent;
 	uint64_t pflags;
 	uint64_t mode;
 	uint64_t parent_mode;
 	sa_bulk_attr_t bulk[3];
 	sa_handle_t *sa_hdl;
 	dmu_buf_t *sa_db;
 	int count = 0;
 	int error;
 
 	/* Fetch parent, flags and mode of the object in one bulk lookup. */
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
 	    &parent, sizeof (parent));
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
 	    &pflags, sizeof (pflags));
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
 	    &mode, sizeof (mode));
 
 	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
 		return (error);
 
 	/*
 	 * When a link is removed its parent pointer is not changed and will
 	 * be invalid.  There are two cases where a link is removed but the
 	 * file stays around, when it goes to the delete queue and when there
 	 * are additional links.
 	 */
 	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * NOTE(review): this lookup passes the raw ZPL_MODE index, whereas
 	 * the bulk lookup above translates through sa_table[ZPL_MODE];
 	 * confirm the two are interchangeable here.
 	 */
 	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
 	/* The parent's handle is only needed for the mode lookup. */
 	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
 	if (error != 0)
 		return (error);
 
 	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
 
 	/*
 	 * Extended attributes can be applied to files, directories, etc.
 	 * Otherwise the parent must be a directory.
 	 */
 	if (!*is_xattrdir && !S_ISDIR(parent_mode))
 		return (SET_ERROR(EINVAL));
 
 	*pobjp = parent;
 
 	return (0);
 }
 
 /*
  * Given the SA handle of an object, return some zpl level statistics.
  *
  * Fills sb->zs_mode, sb->zs_gen, sb->zs_links and sb->zs_ctime with a
  * single bulk SA lookup, translating the ZPL_* indices through
  * 'sa_table'.  Returns the result of sa_bulk_lookup().
  */
 static int
 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
     zfs_stat_t *sb)
 {
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	/* One bulk entry per field; 'count' is advanced by the macro. */
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
 	    &sb->zs_mode, sizeof (sb->zs_mode));
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
 	    &sb->zs_gen, sizeof (sb->zs_gen));
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
 	    &sb->zs_links, sizeof (sb->zs_links));
 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
 	    &sb->zs_ctime, sizeof (sb->zs_ctime));
 
 	return (sa_bulk_lookup(hdl, bulk, count));
 }
 
 /*
  * Build the path of object 'obj' into 'buf' (of 'len' bytes) by walking
  * parent pointers towards the root.
  *
  * The path is assembled right-to-left: 'path' starts at the last byte of
  * 'buf' and each "/component" is copied in front of what is already
  * there.  The walk ends when an object is its own parent.  On success
  * the finished string is moved to the start of 'buf'.
  *
  *	hdl	SA handle of 'obj', owned by the caller; it is never
  *		released here.  Handles grabbed for ancestors are released
  *		before returning.
  *
  * Returns 0 on success, ENAMETOOLONG when the path does not fit in
  * 'buf', or the error from zfs_obj_to_pobj(), zap_value_search() or
  * zfs_grab_sa_handle().
  */
 static int
 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
     sa_attr_type_t *sa_table, char *buf, int len)
 {
 	sa_handle_t *sa_hdl;
 	sa_handle_t *prevhdl = NULL;
 	dmu_buf_t *prevdb = NULL;
 	dmu_buf_t *sa_db = NULL;
 	char *path = buf + len - 1;
 	int error;
 
 	*path = '\0';
 	sa_hdl = hdl;
 
 	for (;;) {
 		uint64_t pobj;
 		char component[MAXNAMELEN + 2];
 		size_t complen;
 		int is_xattrdir;
 
 		/* The handle from two levels down is no longer needed. */
 		if (prevdb)
 			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
 
 		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
 		    &is_xattrdir)) != 0)
 			break;
 
 		/* An object that is its own parent terminates the walk. */
 		if (pobj == obj) {
 			if (path[0] != '/') {
 				/* No room left for the leading '/'. */
 				if (path == buf) {
 					error = SET_ERROR(ENAMETOOLONG);
 					break;
 				}
 				*--path = '/';
 			}
 			break;
 		}
 
 		component[0] = '/';
 		if (is_xattrdir) {
 			(void) sprintf(component + 1, "<xattrdir>");
 		} else {
 			error = zap_value_search(osp, pobj, obj,
 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
 			if (error != 0)
 				break;
 		}
 
 		/*
 		 * Refuse to step in front of the buffer.  This used to be
 		 * an ASSERT only, which left non-DEBUG builds writing
 		 * before 'buf' for paths longer than 'len'.
 		 */
 		complen = strlen(component);
 		if (complen > (size_t)(path - buf)) {
 			error = SET_ERROR(ENAMETOOLONG);
 			break;
 		}
 		path -= complen;
 		bcopy(component, path, complen);
 		obj = pobj;
 
 		/*
 		 * Remember the handle we grabbed ourselves so that it is
 		 * released at the top of the next iteration; the caller's
 		 * 'hdl' is never recorded here.
 		 */
 		if (sa_hdl != hdl) {
 			prevhdl = sa_hdl;
 			prevdb = sa_db;
 		}
 		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
 		if (error != 0) {
 			/* Fall back so the cleanup below releases 'prev'. */
 			sa_hdl = prevhdl;
 			sa_db = prevdb;
 			break;
 		}
 	}
 
 	/* Release the last handle grabbed by this function, if any. */
 	if (sa_hdl != NULL && sa_hdl != hdl) {
 		ASSERT(sa_db != NULL);
 		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
 	}
 
 	/* Slide the finished string (including its NUL) to the front. */
 	if (error == 0)
 		(void) memmove(buf, path, buf + len - path);
 
 	return (error);
 }
 
 /*
  * Write the path of object 'obj' in objset 'osp' into 'buf' ('len'
  * bytes).  Sets up the SA attribute table, grabs a handle for the object,
  * delegates to zfs_obj_to_path_impl() and releases the handle again.
  * Returns 0 on success or the first error encountered.
  */
 int
 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 {
 	sa_attr_type_t *attrs;
 	sa_handle_t *obj_hdl;
 	dmu_buf_t *obj_db;
 	int err;
 
 	if ((err = zfs_sa_setup(osp, &attrs)) != 0)
 		return (err);
 
 	err = zfs_grab_sa_handle(osp, obj, &obj_hdl, &obj_db, FTAG);
 	if (err == 0) {
 		err = zfs_obj_to_path_impl(osp, obj, obj_hdl, attrs, buf, len);
 		zfs_release_sa_handle(obj_hdl, obj_db, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Return both the zpl statistics ('sb') and the path ('buf', 'len' bytes)
  * of object 'obj' in objset 'osp'.  The last byte of 'buf' is set to NUL
  * up front, before anything can fail.  The path is only computed when
  * the statistics lookup succeeded.  Returns 0 or the first error.
  */
 int
 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
     char *buf, int len)
 {
 	sa_attr_type_t *attrs;
 	sa_handle_t *obj_hdl;
 	dmu_buf_t *obj_db;
 	int err;
 
 	buf[len - 1] = '\0';
 
 	if ((err = zfs_sa_setup(osp, &attrs)) != 0)
 		return (err);
 
 	if ((err = zfs_grab_sa_handle(osp, obj, &obj_hdl, &obj_db, FTAG)) != 0)
 		return (err);
 
 	err = zfs_obj_to_stats_impl(obj_hdl, attrs, sb);
 	if (err == 0)
 		err = zfs_obj_to_path_impl(osp, obj, obj_hdl, attrs, buf, len);
 
 	zfs_release_sa_handle(obj_hdl, obj_db, FTAG);
 
 	return (err);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	(revision 296519)
@@ -1,2141 +1,2142 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
  * that change the file system in memory with enough information
  * to be able to replay them. These are stored in memory until
  * either the DMU transaction group (txg) commits them to the stable pool
  * and they can be discarded, or they are flushed to the stable log
  * (also in the pool) due to a fsync, O_DSYNC or other synchronous
  * requirement. In the event of a panic or power fail then those log
  * records (transactions) are replayed.
  *
  * There is one ZIL per file system. Its on-disk (pool) format consists
  * of 3 parts:
  *
  * 	- ZIL header
  * 	- ZIL blocks
  * 	- ZIL records
  *
  * A log record holds a system call transaction. Log blocks can
  * hold many log records and the blocks are chained together.
  * Each ZIL block contains a block pointer (blkptr_t) to the next
  * ZIL block in the chain. The ZIL header points to the first
  * block in the chain. Note there is not a fixed place in the pool
  * to hold blocks. They are dynamically allocated and freed as
  * needed from the blocks available. Figure X shows the ZIL structure:
  */
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  * Exported as a read-write tunable sysctl under _vfs_zfs.
  */
 int zil_replay_disable = 0;
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
     &zil_replay_disable, 0, "Disable intent logging replay");
 
 /*
  * Tunable parameter for debugging or performance analysis.  Setting
  * zfs_nocacheflush will cause corruption on power loss if a volatile
  * out-of-order write cache is enabled.
  * Note that the sysctl is named "cache_flush_disable" while the variable
  * is zfs_nocacheflush.
  */
 boolean_t zfs_nocacheflush = B_FALSE;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
     &zfs_nocacheflush, 0, "Disable cache flush");
 /*
  * Global switch for ZFS TRIM; on by default.  Exported as the "enabled"
  * sysctl under the _vfs_zfs_trim node.
  */
 boolean_t zfs_trim_enabled = B_TRUE;
 SYSCTL_DECL(_vfs_zfs_trim);
 SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
     "Enable ZFS TRIM");
 
 /* Cache from which lwb_t structures are allocated. */
 static kmem_cache_t *zil_lwb_cache;
 
 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
 
 /*
  * True when the bytes still unused in the lwb (lwb_sz - lwb_nused) equal
  * the block's logical size minus one zil_chain_t.  The macro argument is
  * parenthesized so that any pointer expression can be passed safely.
  */
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&(lwb)->lwb_blk) - \
     sizeof (zil_chain_t)) == ((lwb)->lwb_sz - (lwb)->lwb_nused))
 
 
 /*
  * ziltest is by and large an ugly hack, but very useful in
  * checking replay without tedious work.
  * When running ziltest we want to keep all itx's and so maintain
  * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
  * We subtract TXG_CONCURRENT_STATES to allow for common code.
  */
 #define	ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
 
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
 		return (-1);
 	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
 		return (1);
 
 	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
 		return (-1);
 	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
 		return (1);
 
 	return (0);
 }
 
 /*
  * Create the (empty) AVL tree zilog->zl_bp_tree of zil_bp_node_t entries,
  * ordered by zil_bp_compare().
  */
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 /*
  * Tear down zilog->zl_bp_tree: free every zil_bp_node_t still in the
  * tree, then destroy the tree itself.
  */
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	void *walk = NULL;
 	zil_bp_node_t *node;
 
 	for (node = avl_destroy_nodes(tree, &walk); node != NULL;
 	    node = avl_destroy_nodes(tree, &walk)) {
 		kmem_free(node, sizeof (zil_bp_node_t));
 	}
 
 	avl_destroy(tree);
 }
 
 /*
  * Record the identity DVA of 'bp' in zilog->zl_bp_tree.
  *
  * Embedded block pointers are ignored and reported as success.  Returns
  * EEXIST when the DVA is already in the tree, 0 after inserting a new
  * node.
  */
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	const dva_t *ident;
 	zil_bp_node_t *node;
 	avl_index_t slot;
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ident = BP_IDENTITY(bp);
 
 		/* The bare DVA is used as the search key. */
 		if (avl_find(tree, ident, &slot) != NULL)
 			return (SET_ERROR(EEXIST));
 
 		node = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 		node->zn_dva = *ident;
 		avl_insert(tree, node, slot);
 	}
 
 	return (0);
 }
 
 /*
  * Return zilog->zl_header cast to a non-const zil_header_t pointer.
  * As the name says, this is meant for callers running in syncing
  * context that need to modify the header.
  */
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
 /*
  * Seed the checksum words of 'bp' for a new log chain: two random GUID
  * words, the objset id of this zilog, and a starting sequence number
  * of 1.
  */
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *zc = &bp->blk_cksum;
 
 	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
 	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
 	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
 /*
  * Read a log block and make sure it's valid.
  *
  *	bp	block pointer of the log block to read
  *	nbp	out: block pointer of the next block in the chain
  *	dst	out: receives a copy of the log records in the block
  *	end	out: set to one past the last byte copied into 'dst'
  *
  * 'nbp', 'dst' and 'end' are only written when the block validates.
  * Returns 0 on success, the arc_read() error, or ECKSUM when the chain
  * linkage or the recorded size is not consistent.
  */
 static int
 zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
     char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			/* The chain header sits at the front of the block. */
 			zil_chain_t *zilc = abuf->b_data;
 			char *lr = (char *)(zilc + 1);
 			uint64_t size = BP_GET_LSIZE(bp);
 			uint64_t len;
 
 			/*
 			 * zc_nused includes the chain header, so it must be
 			 * at least sizeof (*zilc) and cannot exceed the
 			 * block.  Without this check a short zc_nused would
 			 * wrap 'len' and overrun 'dst' in non-DEBUG builds.
 			 */
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				len = zilc->zc_nused - sizeof (*zilc);
 				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, len);
 				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			/* The chain trailer occupies the end of the block. */
 			char *lr = abuf->b_data;
 			uint64_t size = BP_GET_LSIZE(bp);
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				ASSERT3U(zilc->zc_nused, <=,
 				    SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, zilc->zc_nused);
 				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 
 		VERIFY(arc_buf_remove_ref(abuf, &abuf));
 	}
 
 	return (error);
 }
 
 /*
  * Read the data block referenced by a TX_WRITE log record.
  *
  * When 'wbuf' is non-NULL the block contents are copied into it; a hole
  * block pointer zero-fills 'wbuf' instead (for the larger of the block's
  * logical size and lr_length) without issuing any read.  With a NULL
  * 'wbuf' the block is only read.  Returns 0 or the arc_read() error.
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	const blkptr_t *data_bp = &lr->lr_blkptr;
 	enum zio_flag read_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t arc_flags = ARC_FLAG_WAIT;
 	arc_buf_t *data = NULL;
 	zbookmark_phys_t bookmark;
 	int err;
 
 	if (BP_IS_HOLE(data_bp)) {
 		if (wbuf != NULL) {
 			bzero(wbuf,
 			    MAX(BP_GET_LSIZE(data_bp), lr->lr_length));
 		}
 		return (0);
 	}
 
 	/* An unclaimed log is read speculatively, as a scrub would. */
 	if (zilog->zl_header->zh_claim_txg == 0)
 		read_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	SET_BOOKMARK(&bookmark, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(data_bp));
 
 	err = arc_read(NULL, zilog->zl_spa, data_bp, arc_getbuf_func, &data,
 	    ZIO_PRIORITY_SYNC_READ, read_flags, &arc_flags, &bookmark);
 	if (err != 0)
 		return (err);
 
 	if (wbuf != NULL)
 		bcopy(data->b_data, wbuf, arc_buf_size(data));
 	(void) arc_buf_remove_ref(data, &data);
 
 	return (0);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
  *
  *	parse_blk_func	called once for each block pointer in the chain
  *	parse_lr_func	called once for each record in each valid block
  *	arg, txg	passed through unchanged to both callbacks
  *
  * The outcome of the walk (error, highest block and record sequence
  * numbers, block and record counts) is stored in the zl_parse_* fields
  * of 'zilog'.  Returns 0 or the error that stopped the walk; a record
  * whose length is inconsistent with its block is reported as ECKSUM.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk;
 	char *lrbuf, *lrp;
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
 		char *end;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
 		if (error != 0)
 			break;
 
 		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			uint64_t remain = (uint64_t)(end - lrp);
 
 			/*
 			 * The rest of the block must hold a whole lr_t
 			 * header, and the record must be at least that
 			 * large and must not run past the block.  This
 			 * used to be an ASSERT only, so on non-DEBUG
 			 * builds a zero length looped forever and an
 			 * oversized one walked off the buffer.
 			 */
 			if (remain < sizeof (lr_t) ||
 			    lr->lrc_reclen < sizeof (lr_t) ||
 			    lr->lrc_reclen > remain) {
 				error = SET_ERROR(ECKSUM);
 				goto done;
 			}
 			reclen = lr->lrc_reclen;
 			if (lr->lrc_seq > claim_lr_seq)
 				goto done;
 			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
 				goto done;
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 	}
 done:
 	/* Publish the outcome of this walk for the callers. */
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
 	zil_bp_tree_fini(zilog);
 	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
 
 /*
  * Claim log block if not already committed and not already claimed.
  * If tx == NULL, just verify that the block is claimable.
  *
  * Holes, blocks born before 'first_txg' and blocks already present in
  * the bp tree are skipped and reported as success.  Otherwise returns
  * the result of waiting on the claim zio.
  */
 static int
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
 	uint64_t claim_txg;
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (bp->blk_birth < first_txg)
 		return (0);
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	/* A txg of 0 is passed when only verifying (tx == NULL). */
 	claim_txg = (tx == NULL) ? 0 : first_txg;
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa, claim_txg, bp,
 	    spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 /*
  * Per-record claim callback for zil_parse().  Only TX_WRITE records are
  * of interest; everything else is reported as success.  Returns the
  * read error if the data block cannot be read, otherwise the result of
  * zil_claim_log_block() on the record's block pointer.
  */
 static int
 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lrw;
 
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	lrw = (lr_write_t *)lrc;
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (lrw->lr_blkptr.blk_birth >= first_txg) {
 		int err = zil_read_log_data(zilog, lrw, NULL);
 
 		if (err != 0)
 			return (err);
 	}
 
 	return (zil_claim_log_block(zilog, &lrw->lr_blkptr, tx, first_txg));
 }
 
 /*
  * Per-block free callback for zil_parse(): free log block 'bp' in the
  * txg of 'tx'.  'claim_txg' is unused.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
 	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 /*
  * Per-record free callback for zil_parse().  Frees the data block of a
  * TX_WRITE record, but only when the log had been claimed, the block was
  * born at or after the claim txg, it has not been seen before in this
  * walk, and it is not a hole.  Always returns 0.
  */
 static int
 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	blkptr_t *data_bp = &((lr_write_t *)lrc)->lr_blkptr;
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (claim_txg == 0 || lrc->lrc_txtype != TX_WRITE)
 		return (0);
 	if (data_bp->blk_birth < claim_txg)
 		return (0);
 	/* The bp tree is consulted before the hole test. */
 	if (zil_bp_tree_add(zilog, data_bp) != 0)
 		return (0);
 
 	if (!BP_IS_HOLE(data_bp))
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), data_bp);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize a log write buffer (lwb) for block 'bp' and
  * append it to zilog->zl_lwb_list under zl_lock.
  *
  * The data buffer is sized to the block's logical size.  For ZILOG2
  * blocks lwb_nused starts after a zil_chain_t and lwb_sz is the whole
  * block; otherwise lwb_nused starts at 0 and lwb_sz is the block minus
  * one zil_chain_t.
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
 {
 	lwb_t *new_lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 
 	new_lwb->lwb_zilog = zilog;
 	new_lwb->lwb_blk = *bp;
 	new_lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	new_lwb->lwb_max_txg = txg;
 	new_lwb->lwb_zio = NULL;
 	new_lwb->lwb_tx = NULL;
 
 	if (BP_GET_CHECKSUM(bp) != ZIO_CHECKSUM_ZILOG2) {
 		new_lwb->lwb_nused = 0;
 		new_lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
 	} else {
 		new_lwb->lwb_nused = sizeof (zil_chain_t);
 		new_lwb->lwb_sz = BP_GET_LSIZE(bp);
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, new_lwb);
 	mutex_exit(&zilog->zl_lock);
 
 	return (new_lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  *
  * Adds 'zilog' to the pool's dirty zilog list for 'txg'.  Panics when
  * the dataset is a snapshot.
  */
 void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 	dsl_pool_t *pool = zilog->zl_dmu_pool;
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (!txg_list_add(&pool->dp_dirty_zilogs, zilog, txg))
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(ds->ds_dbuf, zilog);
 }
 
 /*
  * Return B_TRUE when 'zilog' is a member of the pool's dirty zilog list
  * for any of the TXG_SIZE txg slots, B_FALSE otherwise.
  */
 boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *pool = zilog->zl_dmu_pool;
 	int slot = 0;
 
 	while (slot < TXG_SIZE) {
 		if (txg_list_member(&pool->dp_dirty_zilogs, zilog, slot))
 			return (B_TRUE);
 		slot++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Create an on-disk intent log.
  *
  * Returns the lwb for the first log block, or NULL when the initial
  * block could not be allocated.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		/* A wrong-endian block is freed before replacing it. */
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free_zil(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
 		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, &blk, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 
 	/*
 	 * The header can only be expected to match our block when the
 	 * allocation succeeded; on failure no lwb was created and 'blk'
 	 * need not match zh_log.
 	 */
 	ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header.
  * If keep_first is set, then we're replaying a log with no content.
  * We want to keep the first block, however, so that the first
  * synchronous transaction doesn't require a txg_wait_synced()
  * in zil_create().  We don't need to txg_wait_synced() here either
  * when keep_first is set, because both zil_create() and zil_destroy()
  * will wait for any in-progress destroys to complete.
  *
  * Nothing is done (beyond saving the old header) when the log header
  * holds no block.
  */
 void
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	/* Record which txg performs this destroy, and the keep_first mode. */
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		/* In-memory lwbs exist: free each one and its log block. */
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			/*
 			 * NOTE(review): the buffer is freed with lwb_sz,
 			 * while zil_alloc_lwb() allocates BP_GET_LSIZE()
 			 * bytes; for non-ZILOG2 blocks lwb_sz is smaller by
 			 * sizeof (zil_chain_t).  Confirm zio_buf_free()
 			 * tolerates this.
 			 */
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
 	} else if (!keep_first) {
 		/* No lwbs in memory: walk the on-disk chain instead. */
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Free the on-disk log by walking the chain with zil_parse(), using
  * zil_free_log_block() and zil_free_log_record() as callbacks and the
  * header's zh_claim_txg as the txg argument.  The lwb list must be
  * empty.  The result of zil_parse() is deliberately ignored.
  */
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	(void) zil_parse(zilog, zil_free_log_block,
 	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
 }
 
 /*
  * Claim the intent log of dataset 'ds' in the txg of 'txarg' (a
  * dmu_tx_t).
  *
  * When the pool's log state is SPA_LOG_CLEAR the log is discarded
  * instead of claimed.  Always returns 0: a failure to own the objset is
  * at most logged, and the result of zil_parse() is ignored.
  */
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	uint64_t first_txg = dmu_tx_get_txg(tx);
 	zilog_t *zilog;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 
 	/* Log is being cleared: free the head block and zero the header bp. */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
 		if (!BP_IS_HOLE(&zh->zh_log))
 			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
 		BP_ZERO(&zh->zh_log);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, FTAG);
 		return (0);
 	}
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		/* Replay is needed if any record or a second block was seen. */
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  *
  * 'tx' must be NULL.  ENOENT from the walk is treated like a checksum
  * error, and a failure to open the objset is logged and reported as
  * success.
  */
 /* ARGSUSED */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	/*
 	 * Check the first block and determine if it's on a log device
 	 * which may have been removed or faulted prior to loading this
 	 * pool.  If so, there's no point in checking the rest of the log
 	 * as its content should have already been synced to the pool.
 	 */
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		/*
 		 * NOTE(review): 'vd' is dereferenced without a NULL check,
 		 * whereas zil_flush_vdevs() does test the result of
 		 * vdev_lookup_top(); confirm it cannot be NULL here.
 		 */
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * AVL comparator for the vdev tree: orders zil_vdev_node_t entries by
  * their zv_vdev value.  Returns -1, 0 or 1.
  */
 static int
 zil_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t lhs = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t rhs = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	if (lhs == rhs)
 		return (0);
 
 	return (lhs < rhs ? -1 : 1);
 }
 
 /*
  * Remember the vdev of every DVA of 'bp' in zilog->zl_vdev_tree, adding
  * a node only for vdevs not already present.  Does nothing when
  * zfs_nocacheflush is set.  The caller must be the zl_writer.
  */
 void
 zil_add_block(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &zilog->zl_vdev_tree;
 	zil_vdev_node_t key;
 	avl_index_t slot;
 	int ndvas = BP_GET_NDVAS(bp);
 	int d;
 
 	if (zfs_nocacheflush)
 		return;
 
 	ASSERT(zilog->zl_writer);
 
 	/*
 	 * Even though we're zl_writer, we still need a lock because the
 	 * zl_get_data() callbacks may have dmu_sync() done callbacks
 	 * that will run concurrently.
 	 */
 	mutex_enter(&zilog->zl_vdev_lock);
 	for (d = 0; d < ndvas; d++) {
 		zil_vdev_node_t *node;
 
 		key.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
 		if (avl_find(tree, &key, &slot) != NULL)
 			continue;
 
 		node = kmem_alloc(sizeof (zil_vdev_node_t), KM_SLEEP);
 		node->zv_vdev = key.zv_vdev;
 		avl_insert(tree, node, slot);
 	}
 	mutex_exit(&zilog->zl_vdev_lock);
 }
 
 /*
  * Issue a cache flush to every vdev recorded in zilog->zl_vdev_tree,
  * emptying the tree in the process, and wait for all flushes to finish.
  * Returns immediately when the tree is empty.  The caller must be the
  * zl_writer.
  */
 static void
 zil_flush_vdevs(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_vdev_tree;
 	spa_t *spa = zilog->zl_spa;
 	zil_vdev_node_t *node;
 	void *walk = NULL;
 	zio_t *root;
 
 	ASSERT(zilog->zl_writer);
 
 	/*
 	 * We don't need zl_vdev_lock here because we're the zl_writer,
 	 * and all zl_get_data() callbacks are done.
 	 */
 	if (avl_numnodes(tree) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 
 	for (node = avl_destroy_nodes(tree, &walk); node != NULL;
 	    node = avl_destroy_nodes(tree, &walk)) {
 		vdev_t *top = vdev_lookup_top(spa, node->zv_vdev);
 
 		/* A vdev that can no longer be found is skipped. */
 		if (top != NULL)
 			zio_flush(root, top);
 		kmem_free(node, sizeof (zil_vdev_node_t));
 	}
 
 	/*
 	 * Wait for all the flushes to complete.  Not all devices actually
 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
 	 */
 	(void) zio_wait(root);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
 
 /*
  * Function called when a log block write completes.
  *
  * Installed as the done callback of the zio built in zil_lwb_write_init(),
  * with the lwb as zio->io_private.  Frees the lwb's data buffer, clears
  * lwb_buf and lwb_tx under zl_lock, and finally commits the tx that
  * zil_lwb_write_start() stored in lwb_tx.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	/* Snapshot the tx now: lwb_tx is cleared below before the commit. */
 	dmu_tx_t *tx = lwb->lwb_tx;
 
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
 	ASSERT(!BP_IS_GANG(zio->io_bp));
 	ASSERT(!BP_IS_HOLE(zio->io_bp));
 	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
 	/*
 	 * Ensure the lwb buffer pointer is cleared before releasing
 	 * the txg. If we have had an allocation failure and
 	 * the txg is waiting to sync then we want zil_sync()
 	 * to remove the lwb so that it's not picked up as the next new
 	 * one in zil_commit_writer(). zil_sync() will only remove
 	 * the lwb if lwb_buf is null.
 	 */
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_buf = NULL;
 	lwb->lwb_tx = NULL;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * Now that we've written this log block, we have a stable pointer
 	 * to the next block in the chain, so it's OK to let the txg in
 	 * which we allocated the next block sync.
 	 */
 	dmu_tx_commit(tx);
 }
 
 /*
  * Initialize the io for a log block: create the zilog's root zio and the
  * lwb's rewrite zio, each only if it does not exist yet.
  */
 static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
 	zbookmark_phys_t zb;
 
 	if (zilog->zl_root_zio == NULL) {
 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
 
 	/* Nothing more to do when this lwb already has its write zio. */
 	if (lwb->lwb_zio != NULL)
 		return;
 
 	/* The bookmark is built from words of the block's checksum. */
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 	    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
 	    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 }
 
 /*
  * The limited set of sizes used for intent log blocks.
  *
  * Every size must be a multiple of 4KB.  Only the amount actually used
  * (again aligned to 4KB) gets written, but we still cannot simply always
  * allocate SPA_OLD_MAXBLOCKSIZE because the slog space could be exhausted.
  * The table ends with a UINT64_MAX entry.
  */
 uint64_t zil_block_buckets[] = {
 	4 * 1024,		/* non TX_WRITE */
 	(8 + 4) * 1024,		/* data base */
 	(32 + 4) * 1024,	/* NFS writes */
 	UINT64_MAX
 };
 
 /*
  * The slog is used only while the logbias is 'latency' and either the
  * current commit size is below zil_slog_limit or the total itx list size
  * is below twice that limit.  Setting zil_slog_limit to UINT64_MAX
  * disables the limit check.
  */
 uint64_t zil_slog_limit = 1024 * 1024;
 #define	USE_SLOG(zilog)							\
 	((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY &&			\
 	((zilog)->zl_cur_used < zil_slog_limit ||			\
 	(zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))
 
 /*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  *
  * Allocates the next log block, stores its block pointer in the chain
  * header of the current block (lwb) and then issues the write of lwb.
  * Returns the newly allocated lwb, or NULL when the next block could not
  * be allocated; the write of lwb is issued in either case.
  */
 static lwb_t *
 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
 	spa_t *spa = zilog->zl_spa;
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
 
 	/*
 	 * The chain header sits at the start of the buffer for ZILOG2
 	 * checksummed blocks and lwb_sz bytes into the buffer otherwise.
 	 */
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 		bp = &zilc->zc_next_blk;
 	} else {
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
 		bp = &zilc->zc_next_blk;
 	}
 
 	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
 
 	/*
 	 * Allocate the next block and save its address in this block
 	 * before writing it in order to establish the log chain.
 	 * Note that if the allocation of nlwb synced before we wrote
 	 * the block that points at it (lwb), we'd leak it if we crashed.
 	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
 	 * We dirty the dataset to ensure that zil_sync() will be called
 	 * to clean up in the event of allocation failure or I/O failure.
 	 */
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	lwb->lwb_tx = tx;
 
 	/*
 	 * Log blocks are pre-allocated. Here we select the size of the next
 	 * block, based on size used in the last block.
 	 * - first find the smallest bucket that will fit the block from a
 	 *   limited set of block sizes. This is because it's faster to write
 	 *   blocks allocated from the same metaslab as they are adjacent or
 	 *   close.
 	 * - next find the maximum from the new suggested size and an array of
 	 *   previous sizes. This lessens a picket fence effect of wrongly
 	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
 	 *   requests.
 	 *
 	 * Note we only write what is used, but we can't just allocate
 	 * the maximum block size because we can exhaust the available
 	 * pool log space.
 	 */
 	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
 	/* The UINT64_MAX entry that ends the table stops this scan. */
 	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
 		continue;
 	zil_blksz = zil_block_buckets[i];
 	if (zil_blksz == UINT64_MAX)
 		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
 	/* Advance the rotor, wrapping with a mask of ZIL_PREV_BLKS - 1. */
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
 	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
 	    USE_SLOG(zilog));
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		/* The next block inherits this checksum with the seq bumped. */
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
 		nlwb = zil_alloc_lwb(zilog, bp, txg);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
 	}
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		/* For Slim ZIL only write what is used. */
 		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
 		ASSERT3U(wsz, <=, lwb->lwb_sz);
 		zio_shrink(lwb->lwb_zio, wsz);
 
 	} else {
 		wsz = lwb->lwb_sz;
 	}
 
 	zilc->zc_pad = 0;
 	zilc->zc_nused = lwb->lwb_nused;
 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
 	/*
 	 * clear unused data for security
 	 */
 	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
 
 	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
 
 	/*
 	 * If there was an allocation failure then nlwb will be null which
 	 * forces a txg_wait_synced().
 	 */
 	return (nlwb);
 }
 
 /*
  * Copy the log record of one itx into the current log write buffer.
  *
  * Returns the lwb that subsequent records should go into: the one passed
  * in, or a new one when the record did not fit and a new block had to be
  * started.  Returns NULL when lwb is NULL on entry or when starting the
  * next block fails.  When the record cannot be logged (it does not fit
  * even in a fresh block, or zl_get_data() fails) the lwb is returned
  * without the record having been accounted in lwb_nused.
  */
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
 	lr_t *lrc = &itx->itx_lr; /* common log record */
 	lr_write_t *lrw = (lr_write_t *)lrc;
 	char *lr_buf;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
 	uint64_t dlen = 0;
 
 	if (lwb == NULL)
 		return (NULL);
 
 	ASSERT(lwb->lwb_buf != NULL);
 	ASSERT(zilog_is_dirty(zilog) ||
 	    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 
 	/* WR_NEED_COPY writes carry their data right after the record. */
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 
 	zilog->zl_cur_used += (reclen + dlen);
 
 	zil_lwb_write_init(zilog, lwb);
 
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 */
 	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
 		lwb = zil_lwb_write_start(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
 		/* Too big even for an empty block: wait for the txg instead. */
 		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 			return (lwb);
 		}
 	}
 
 	/* From here on lrc/lrw refer to the copy inside the log buffer. */
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
 	lrc = (lr_t *)lr_buf;
 	lrw = (lr_write_t *)lrc;
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lrc->lrc_txtype == TX_WRITE) {
 		if (txg > spa_freeze_txg(zilog->zl_spa))
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 		if (itx->itx_wr_state != WR_COPIED) {
 			char *dbuf;
 			int error;
 
 			if (dlen) {
 				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
 				dbuf = lr_buf + reclen;
 				lrw->lr_common.lrc_reclen += dlen;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
 			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
 			/* On EIO fall back to waiting for the txg to sync. */
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
 			}
 			/* Any other failure: the record is simply not logged. */
 			if (error != 0) {
 				ASSERT(error == ENOENT || error == EEXIST ||
 				    error == EALREADY);
 				return (lwb);
 			}
 		}
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
 	lwb->lwb_nused += reclen + dlen;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	return (lwb);
 }
 
 /*
  * Allocate an itx whose embedded log record has room for lrsize bytes,
  * rounded up to a multiple of sizeof (uint64_t).  The record's type and
  * length are filled in and the itx defaults to synchronous.
  */
 itx_t *
 zil_itx_create(uint64_t txtype, size_t lrsize)
 {
 	size_t reclen = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
 	itx_t *itx = kmem_alloc(offsetof(itx_t, itx_lr) + reclen, KM_SLEEP);
 
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_sod = reclen; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = reclen;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 
 	return (itx);
 }
 
 /*
  * Free an itx.  The allocation size is recomputed from the record length
  * stored in the itx itself, mirroring the sizing in zil_itx_create().
  */
 void
 zil_itx_destroy(itx_t *itx)
 {
 	size_t itx_size = offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen;
 
 	kmem_free(itx, itx_size);
 }
 
 /*
  * Release every itx, sync and async, held by an itxs_t and then the
  * itxs_t itself.  The itxs_t has already been detached, so no locks are
  * needed.
  */
 static void
 zil_itxg_clean(itxs_t *itxs)
 {
 	list_t *sync_list = &itxs->i_sync_list;
 	avl_tree_t *async_tree = &itxs->i_async_tree;
 	itx_async_node_t *ian;
 	void *cookie = NULL;
 	itx_t *itx;
 
 	/* Drain the synchronous list. */
 	for (itx = list_head(sync_list); itx != NULL;
 	    itx = list_head(sync_list)) {
 		list_remove(sync_list, itx);
 		kmem_free(itx, offsetof(itx_t, itx_lr) +
 		    itx->itx_lr.lrc_reclen);
 	}
 
 	/* Tear down the async tree, draining each node's list in turn. */
 	while ((ian = avl_destroy_nodes(async_tree, &cookie)) != NULL) {
 		list_t *async_list = &ian->ia_list;
 
 		for (itx = list_head(async_list); itx != NULL;
 		    itx = list_head(async_list)) {
 			list_remove(async_list, itx);
 			kmem_free(itx, offsetof(itx_t, itx_lr) +
 			    itx->itx_lr.lrc_reclen);
 		}
 		list_destroy(async_list);
 		kmem_free(ian, sizeof (itx_async_node_t));
 	}
 	avl_destroy(async_tree);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	if (o1 < o2)
 		return (-1);
 	if (o1 > o2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Remove all async itx with the given oid.
  *
  * The matching per-object lists are unhooked under each itxg lock and
  * the itxs themselves are freed afterwards, with no lock held.
  */
 static void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t first_txg, txg;
 	list_t doomed;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&doomed, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	if (spa_freeze_txg(zilog->zl_spa) == UINT64_MAX)
 		first_txg = spa_last_synced_txg(zilog->zl_spa) + 1;
 	else
 		first_txg = ZILTEST_TXG;	/* ziltest support */
 
 	for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
 	    txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			avl_tree_t *tree = &itxg->itxg_itxs->i_async_tree;
 			itx_async_node_t *ian;
 			avl_index_t where;
 
 			/* Locate the object node and append its list. */
 			ian = avl_find(tree, &oid, &where);
 			if (ian != NULL)
 				list_move_tail(&doomed, &ian->ia_list);
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 
 	for (itx = list_head(&doomed); itx != NULL;
 	    itx = list_head(&doomed)) {
 		list_remove(&doomed, itx);
 		kmem_free(itx, offsetof(itx_t, itx_lr) +
 		    itx->itx_lr.lrc_reclen);
 	}
 	list_destroy(&doomed);
 }
 
 /*
  * Attach an itx to the in-memory itx lists for the txg of tx.
  *
  * Synchronous itxs (itx_sync set) are appended to the txg's sync list and
  * accounted in zl_itx_list_sz; asynchronous ones are appended to a
  * per-object list kept in the txg's async AVL tree, keyed by lr_foid.
  * If the slot for this txg still holds the itxs of an earlier txg, that
  * itxs is detached here and freed once the itxg lock has been dropped.
  */
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Object ids can be re-instantiated in the next txg so
 	 * remove any async transactions to avoid future leaks.
 	 * This can happen if a fsync occurs on the re-instantiated
 	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
 	 * the new file data and flushes a write record for the old object.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
 		zil_remove_async(zilog, itx->itx_oid);
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	/* With a freeze txg set, everything is filed under ZILTEST_TXG. */
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
 			itxg->itxg_sod = 0;
 			clean = itxg->itxg_itxs;
 		}
 		ASSERT(itxg->itxg_sod == 0);
 		/* Claim the slot for this txg with a fresh, empty itxs. */
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
 		itxg->itxg_sod += itx->itx_sod;
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
 		itx_async_node_t *ian;
 		avl_index_t where;
 
 		/* Find the object's async node, creating it on first use. */
 		ian = avl_find(t, &foid, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	/* The record always carries the real txg of tx, not ZILTEST_TXG. */
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 	zilog_dirty(zilog, txg);
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * taskq callback with the void (*)(void *) signature that zil_clean()
  * hands to taskq_dispatch(); it forwards its argument, an itxs_t, to
  * zil_itxg_clean().  Going through this wrapper avoids calling
  * zil_itxg_clean() via a function pointer of an incompatible type.
  */
 static void
 zil_itxg_clean_task(void *arg)
 {
 	zil_itxg_clean(arg);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after we
  * have written out the uberblocks (i.e. txg has been committed) so that
  * don't inadvertently clean out in-memory log records that would be required
  * by zil_commit().
  *
  * The itxs for synced_txg is detached from its slot under the itxg lock
  * and freed without the lock held, either by the clean taskq or, if the
  * dispatch fails, in-line.  Slots that are empty or that belong to
  * ZILTEST_TXG are left alone.
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *clean_me;
 
 	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT(itxg->itxg_txg != 0);
 	ASSERT(zilog->zl_clean_taskq != NULL);
 	/* These itxs no longer count toward the pending list size. */
 	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
 	itxg->itxg_sod = 0;
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 	/*
 	 * Preferably start a task queue to free up the old itxs but
 	 * if taskq_dispatch can't allocate resources to do that then
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	if (taskq_dispatch(zilog->zl_clean_taskq, zil_itxg_clean_task,
 	    clean_me, TQ_NOSLEEP) == 0)
 		zil_itxg_clean(clean_me);
 }
 
 /*
  * Get the list of itxs to commit into zl_itx_commit_list.
  *
  * The sync list of each of the TXG_CONCURRENT_STATES txgs following the
  * last synced one is appended to the commit list, and the size accounted
  * for those lists is subtracted from zl_itx_list_sz.
  */
 static void
 zil_get_commit_list(zilog_t *zilog)
 {
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 	uint64_t first_txg, txg;
 	uint64_t moved_sod = 0;
 
 	if (spa_freeze_txg(zilog->zl_spa) == UINT64_MAX)
 		first_txg = spa_last_synced_txg(zilog->zl_spa) + 1;
 	else
 		first_txg = ZILTEST_TXG;	/* ziltest support */
 
 	for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
 	    txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			list_move_tail(commit_list,
 			    &itxg->itxg_itxs->i_sync_list);
 			moved_sod += itxg->itxg_sod;
 			itxg->itxg_sod = 0;
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 
 	atomic_add_64(&zilog->zl_itx_list_sz, -moved_sod);
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  * A foid of 0 moves the async itxs of every object.
  */
 static void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t first_txg, txg;
 
 	if (spa_freeze_txg(zilog->zl_spa) == UINT64_MAX)
 		first_txg = spa_last_synced_txg(zilog->zl_spa) + 1;
 	else
 		first_txg = ZILTEST_TXG;	/* ziltest support */
 
 	for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
 	    txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 		itx_async_node_t *ian;
 		avl_tree_t *tree;
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * With no foid given, walk the whole tree and append every
 		 * list to the sync list; otherwise append only the list of
 		 * the matching node.  Lists are added to the end rather
 		 * than the beginning to ensure the create has happened.
 		 */
 		tree = &itxg->itxg_itxs->i_async_tree;
 		if (foid == 0) {
 			void *cookie = NULL;
 
 			while ((ian = avl_destroy_nodes(tree, &cookie)) !=
 			    NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 				list_destroy(&ian->ia_list);
 				kmem_free(ian, sizeof (itx_async_node_t));
 			}
 		} else {
 			avl_index_t where;
 
 			ian = avl_find(tree, &foid, &where);
 			if (ian != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * Write out the itxs gathered on zl_itx_commit_list.
  *
  * Called with zl_lock held; the lock is dropped while the commit list is
  * processed and is held again on every return path.
  */
 static void
 zil_commit_writer(zilog_t *zilog)
 {
 	spa_t *spa = zilog->zl_spa;
 	lwb_t *lwb = NULL;
 	itx_t *itx;
 	int error = 0;
 
 	ASSERT(zilog->zl_root_zio == NULL);
 
 	mutex_exit(&zilog->zl_lock);
 
 	zil_get_commit_list(zilog);
 
 	/*
 	 * With nothing to commit, return before zil_create() gets a
 	 * chance to dirty the fs.
 	 */
 	if (list_head(&zilog->zl_itx_commit_list) == NULL) {
 		mutex_enter(&zilog->zl_lock);
 		return;
 	}
 
 	/* A suspended log gets no lwb at all. */
 	if (!zilog->zl_suspend) {
 		lwb = list_tail(&zilog->zl_lwb_list);
 		if (lwb == NULL)
 			lwb = zil_create(zilog);
 	}
 
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
 	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
 		uint64_t txg = itx->itx_lr.lrc_txg;
 
 		ASSERT(txg);
 
 		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		kmem_free(itx, offsetof(itx_t, itx_lr)
 		    + itx->itx_lr.lrc_reclen);
 	}
 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
 
 	/* write the last block out */
 	if (lwb != NULL && lwb->lwb_zio != NULL)
 		lwb = zil_lwb_write_start(zilog, lwb);
 
 	zilog->zl_cur_used = 0;
 
 	/*
 	 * Wait if necessary for the log blocks to be on stable storage.
 	 */
 	if (zilog->zl_root_zio != NULL) {
 		error = zio_wait(zilog->zl_root_zio);
 		zilog->zl_root_zio = NULL;
 		zil_flush_vdevs(zilog);
 	}
 
 	if (error != 0 || lwb == NULL)
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	mutex_enter(&zilog->zl_lock);
 
 	/*
 	 * Remember the highest committed log sequence number for ztest.
 	 * It is only updated when all the log writes succeeded, because
 	 * ztest wants to ASSERT that it got the whole log chain.
 	 */
 	if (error == 0 && lwb != NULL)
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 }
 
 /*
  * Commit zfs transactions to stable storage.
  * If foid is 0 push out all transactions, otherwise push only those
  * for that object or might reference that object.
  *
  * itxs are committed in batches. In a heavily stressed zil there will be
  * a commit writer thread who is writing out a bunch of itxs to the log
  * for a set of committing threads (cthreads) in the same batch as the writer.
  * Those cthreads are all waiting on the same cv for that batch.
  *
  * There will also be a different and growing batch of threads that are
  * waiting to commit (qthreads). When the committing batch completes
  * a transition occurs such that the cthreads exit and the qthreads become
  * cthreads. One of the new cthreads becomes the writer thread for the
  * batch. Any new threads arriving become new qthreads.
  *
  * Only 2 condition variables are needed and there's no transition
  * between the two cvs needed. They just flip-flop between qthreads
  * and cthreads.
  *
  * Using this scheme we can efficiently wakeup up only those threads
  * that have been committed.
  *
  * Returns immediately, without committing anything, when zl_sync is
  * ZFS_SYNC_DISABLED.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t mybatch;
 
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	/* move the async itxs for the foid to the sync queues */
 	zil_async_to_sync(zilog, foid);
 
 	mutex_enter(&zilog->zl_lock);
 	mybatch = zilog->zl_next_batch;
 	while (zilog->zl_writer) {
 		/* Sleep on the cv selected by the low bit of our batch number. */
 		cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
 		/* A writer has already committed our batch; we are done. */
 		if (mybatch <= zilog->zl_com_batch) {
 			mutex_exit(&zilog->zl_lock);
 			return;
 		}
 	}
 
 	/* No writer is active: become the writer for this batch. */
 	zilog->zl_next_batch++;
 	zilog->zl_writer = B_TRUE;
 	zil_commit_writer(zilog);
 	zilog->zl_com_batch = mybatch;
 	zilog->zl_writer = B_FALSE;
 	mutex_exit(&zilog->zl_lock);
 
 	/* wake up one thread to become the next writer */
 	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
 
 	/* wake up all threads waiting for this batch to be committed */
 	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  *
  * Only acts during the first sync pass.  Under zl_lock it publishes the
  * replayed sequence number recorded for this txg, resets the header if
  * the log was destroyed in this txg, and then walks zl_lwb_list from the
  * head, freeing each lwb whose buffer has been released and whose
  * lwb_max_txg is not beyond this txg.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	/* Move this txg's replayed sequence number into the header. */
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		/* Keep a copy of the log blkptr; the header is zeroed below. */
 		blkptr_t blk = zh->zh_log;
 
 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
 
 		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		}
 	}
 
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		/* The header always points at the current head of the list. */
 		zh->zh_log = lwb->lwb_blk;
 		/* Stop at the first lwb that still has a buffer or newer data. */
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free_zil(spa, txg, &lwb->lwb_blk);
 		kmem_cache_free(zil_lwb_cache, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_head(&zilog->zl_lwb_list) == NULL)
 			BP_ZERO(&zh->zh_log);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Create the kmem cache (zil_lwb_cache) from which struct lwb objects
  * are allocated.  Paired with zil_fini().
  */
 void
 zil_init(void)
 {
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
 	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the lwb kmem cache created by zil_init().
  */
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_lwb_cache);
 }
 
 /*
  * Set the zilog's sync setting (zl_sync).  zil_commit() returns without
  * doing anything while this is ZFS_SYNC_DISABLED.
  */
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 /*
  * Set the zilog's logbias setting (zl_logbias), which the USE_SLOG()
  * macro compares against ZFS_LOGBIAS_LATENCY.
  */
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 /*
  * Allocate and initialize the in-memory zilog for an objset: record the
  * objset, its header, pool and properties, and set up the locks, lists,
  * vdev tree and condition variables.  Returns the new zilog.
  */
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 
 	/* What this log belongs to and the properties taken from it. */
 	zilog->zl_os = os;
 	zilog->zl_header = zh_phys;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_next_batch = 1;
 
 	/* Locks. */
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	for (int slot = 0; slot < TXG_SIZE; slot++) {
 		mutex_init(&zilog->zl_itxg[slot].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	/* Containers. */
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
 
 	/* Condition variables. */
 	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
 
 /*
  * Tear down a zilog: destroy its lists, vdev tree, locks and condition
  * variables, free any itxs still attached to a txg slot, and release the
  * zilog itself.  The lwb and commit lists are expected to be empty.
  */
 void
 zil_free(zilog_t *zilog)
 {
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	avl_destroy(&zilog->zl_vdev_tree);
 	mutex_destroy(&zilog->zl_vdev_lock);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	/*
 	 * An itx can be generated without dirtying a txg (e.g. ztest
 	 * TX_TRUNCATE), in which case no zil_clean() callback ever removes
 	 * it, so such leftovers are freed here.  The ziltest itxs are
 	 * freed here as well.
 	 */
 	for (int slot = 0; slot < TXG_SIZE; slot++) {
 		if (zilog->zl_itxg[slot].itxg_itxs != NULL)
 			zil_itxg_clean(zilog->zl_itxg[slot].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[slot].itxg_lock);
 	}
 
 	mutex_destroy(&zilog->zl_lock);
 
 	cv_destroy(&zilog->zl_cv_writer);
 	cv_destroy(&zilog->zl_cv_suspend);
 	cv_destroy(&zilog->zl_cv_batch[0]);
 	cv_destroy(&zilog->zl_cv_batch[1]);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log: install the get_data callback and create the
  * "zil_clean" taskq on the objset's zilog, which is then returned.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	/* The log must not be open already and must hold no lwbs. */
 	ASSERT(zilog->zl_clean_taskq == NULL);
 	ASSERT(zilog->zl_get_data == NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
 	    2, 2, TASKQ_PREPOPULATE);
 	zilog->zl_get_data = get_data;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  *
  * Commits all outstanding itxs, waits for the txg of the last lwb
  * activity to sync, destroys the clean taskq, clears the get_data
  * callback and frees the one lwb that may remain on zl_lwb_list.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
 	uint64_t txg = 0;
 
 	zil_commit(zilog, 0); /* commit all itx */
 
 	/*
 	 * The lwb_max_txg for the stubby lwb will reflect the last activity
 	 * for the zil.  After a txg_wait_synced() on the txg we know all the
 	 * callbacks have occurred that may clean the zil.  Only then can we
 	 * destroy the zl_clean_taskq.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb != NULL)
 		txg = lwb->lwb_max_txg;
 	mutex_exit(&zilog->zl_lock);
 	/* txg stays 0 when there is no lwb; then there is nothing to wait for. */
 	if (txg)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	ASSERT(!zilog_is_dirty(zilog));
 
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * We should have only one LWB left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		kmem_cache_free(zil_lwb_cache, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /* Hold tag shared by zil_suspend() and zil_resume() for their holds. */
 static char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  *
  * Returns 0 on success, the error from dmu_objset_hold() if the objset
  * cannot be held, or EBUSY if the log still needs to be replayed.
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	/* Take the long hold, then drop the pool hold taken under the same tag. */
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/* We are the first suspender: flag it so later callers wait for us. */
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	zil_commit(zilog, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	/* Done suspending; wake any callers waiting on zl_cv_suspend. */
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 /*
  * Undo one zil_suspend(): decrement the suspend count under zl_lock and
  * release the long hold and the dataset hold taken under suspend_tag.
  * The cookie is the objset that zil_suspend() stored in *cookiep.
  */
 void
 zil_resume(void *cookie)
 {
 	objset_t *os = cookie;
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 /*
  * Argument bundle that zil_replay_log_record() receives through its
  * opaque 'zra' parameter.
  */
 typedef struct zil_replay_arg {
 	/* NOTE(review): presumably the per-txtype replay vectors -- confirm */
 	zil_replay_func_t **zr_replay;
 	/* NOTE(review): opaque argument; its use is not visible here */
 	void		*zr_arg;
 	/* set when the log record was byteswapped and must be swapped back */
 	boolean_t	zr_byteswap;
 	/* buffer into which the log record is copied before replay */
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 /*
  * Report a failed replay of a log record: back out the record's
  * sequence number from zl_replaying_seq, log a warning naming the
  * dataset and the record, and hand the error back to the caller.
  */
 static int
 zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
 {
 	char dsname[MAXNAMELEN];
 	uint64_t txtype = lr->lrc_txtype & ~TX_CI;
 	const char *ci_tag = (lr->lrc_txtype & TX_CI) ? "CI" : "";
 
 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
 
 	dmu_objset_name(zilog->zl_os, dsname);
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, dsname,
 	    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, ci_tag);
 
 	return (error);
 }
 
 /*
  * Per-record callback that zil_replay() hands to zil_parse().  Replays
  * one log record through the replay vector selected by its txtype.
  *
  * zra is the zil_replay_arg_t set up by zil_replay().  Returns 0 when
  * the record is skipped or replayed successfully; otherwise returns the
  * error after reporting it through zil_replay_error().
  */
 static int
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	/*
 	 * Record which sequence number is in flight; zil_replaying()
 	 * copies it into zl_replayed_seq[] and zil_replay_error()
 	 * decrements it again on failure.
 	 */
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    ((lr_ooo_t *)lr)->lr_foid, NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	bcopy(lr, zr->zr_lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 * The data lands directly after the copied record in zr_lr.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 /*
  * Block callback passed to zil_parse() by zil_replay(): counts each log
  * block visited in zl_replay_blks.  bp, arg and claim_txg are unused.
  */
 /* ARGSUSED */
 static int
 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zilog->zl_replay_blks += 1;
 	return (0);
 }
 
 /*
  * If this dataset has a non-empty intent log, replay it and destroy it.
  *
  * os is the objset whose log is replayed, arg is passed as the first
  * argument to every replay vector, and replay_func is the table of
  * replay vectors indexed by transaction type.
  */
 void
 zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	zil_replay_arg_t zr;
 
 	/* The header does not ask for a replay: just destroy the log. */
 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
 		zil_destroy(zilog, B_TRUE);
 		return;
 	}
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	/*
 	 * Scratch buffer for zil_replay_log_record(), which copies each
 	 * record into it and may append TX_WRITE data after the record.
 	 */
 	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	/* zl_replay stays set until the destroy below has synced. */
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg);
 	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 }
 
 /*
  * Returns B_TRUE when sync is disabled for this log or when the log is
  * currently being replayed.  In the replay case the dataset is dirtied
  * in 'tx' and the in-flight sequence number is recorded in the
  * zl_replayed_seq[] slot for tx's txg.
  */
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (!zilog->zl_replay)
 		return (B_FALSE);
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 	    zilog->zl_replaying_seq;
 
 	return (B_TRUE);
 }
 
 /*
  * Try to suspend the intent log of dataset 'osname' with no cookie.
  * Any failure from zil_suspend() is reported as EEXIST; arg is unused.
  */
 /* ARGSUSED */
 int
 zil_vdev_offline(const char *osname, void *arg)
 {
 	if (zil_suspend(osname, NULL) != 0)
 		return (SET_ERROR(EEXIST));
 
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	(revision 296519)
@@ -1,3657 +1,3658 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/trim_map.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
 /*
  * When nonzero, zio_init() creates the per-size buffer caches and the
  * zio_*buf_alloc()/free() routines use them; when zero those routines
  * use kmem_alloc()/kmem_free() instead.  Enabled by default on amd64
  * only.
  */
 #if defined(__amd64__)
 static int zio_use_uma = 1;
 #else
 static int zio_use_uma = 0;
 #endif
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
     "Use uma(9) for ZIO allocations");
 /*
  * When nonzero, zio_buf_alloc() adds KM_NODEBUG to its kmem_alloc()
  * flags (only on the path taken when zio_use_uma is 0).
  */
 static int zio_exclude_metadata = 0;
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
     "Exclude metadata buffers from dumps as well");
 
 /*
  * Named counters for TRIM activity.  zio_init() publishes this table
  * as the virtual "zio_trim" kstat.
  */
 zio_trim_stats_t zio_trim_stats = {
 	{ "bytes",		KSTAT_DATA_UINT64,
 	  "Number of bytes successfully TRIMmed" },
 	{ "success",		KSTAT_DATA_UINT64,
 	  "Number of successful TRIM requests" },
 	{ "unsupported",	KSTAT_DATA_UINT64,
 	  "Number of TRIM requests that failed because TRIM is not supported" },
 	{ "failed",		KSTAT_DATA_UINT64,
 	  "Number of TRIM requests that failed for reasons other than not supported" },
 };
 
 /* kstat handle created in zio_init() and deleted in zio_fini(). */
 static kstat_t *zio_trim_ksp;
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 /*
  * Printable names, one per I/O type.
  * NOTE(review): presumably indexed by zio_type_t in the order listed;
  * confirm against the enum definition.
  */
 const char *zio_type_name[ZIO_TYPES] = {
 	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
 	"zio_ioctl"
 };
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 /* Cache of zio_t objects; see zio_create() and zio_destroy(). */
 kmem_cache_t *zio_cache;
 /* Cache of zio_link_t parent/child links; see zio_add_child(). */
 kmem_cache_t *zio_link_cache;
 /*
  * Per-size buffer caches, indexed by (size - 1) >> SPA_MINBLOCKSHIFT.
  * They are populated by zio_init() only when zio_use_uma is set.
  */
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
 
 /*
  * NOTE(review): presumably the two values a pipeline stage function
  * returns to tell the executor whether to keep going; the users are
  * outside this part of the file.
  */
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
 /*
  * 1 << (level * (indblkshift - SPA_BLKPTRSHIFT)).
  * NOTE(review): looks like the number of level-0 blocks covered by one
  * block pointer at the given indirection level; confirm with callers.
  */
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 /* NOTE(review): meaning not visible here; used elsewhere in zio.c. */
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly affect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  */
 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
     &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
     &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
     &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 /* NOTE(review): consumer is outside this part of the file. */
 boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
 
 /*
  * zio_init() creates buffer caches for sizes above this limit with
  * KMC_NODEBUG.  With ZFS_DEBUG off the limit is 0, so every cache gets
  * KMC_NODEBUG.
  */
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
 #else
 int zio_buf_debug_limit = 0;
 #endif
 
 /*
  * Set up the zio subsystem: create the zio_t and zio_link_t caches,
  * create the per-size buffer caches when zio_use_uma is set, call
  * zio_inject_init(), and install the "zio_trim" kstat.
  */
 void
 zio_init(void)
 {
 	size_t c;
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	/*
 	 * Without UMA the buffer cache arrays are left unpopulated and
 	 * the zio_*buf_alloc() routines use kmem_alloc() instead.
 	 */
 	if (!zio_use_uma)
 		goto out;
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
 	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 	 * for each quarter-power of 2.
 	 */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t p2 = size;
 		size_t align = 0;
 		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 
 		/* Clear low set bits: p2 = largest power of two <= size. */
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 
 #ifdef illumos
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 #endif
 #endif /* illumos */
 		/* align stays 0 for sizes that do not get their own cache. */
 		if (size <= 4 * SPA_MINBLOCKSIZE) {
 			align = SPA_MINBLOCKSIZE;
 		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
 			align = MIN(p2 >> 2, PAGESIZE);
 		}
 
 		if (align != 0) {
 			char name[36];
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, NULL, cflags);
 
 			/*
 			 * Since zio_data bufs do not appear in crash dumps, we
 			 * pass KMC_NOTOUCH so that no allocator metadata is
 			 * stored with the buffers.
 			 */
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, NULL,
 			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
 		}
 	}
 
 	/*
 	 * Walk from the largest size down so that every slot without a
 	 * cache of its own shares the next larger slot's cache.
 	 */
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 out:
 
 	zio_inject_init();
 
 	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
 	    KSTAT_TYPE_NAMED,
 	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	/* kstat creation failure is tolerated; the stats are just not exported. */
 	if (zio_trim_ksp != NULL) {
 		zio_trim_ksp->ks_data = &zio_trim_stats;
 		kstat_install(zio_trim_ksp);
 	}
 }
 
 /*
  * Tear down what zio_init() set up.  Adjacent buffer-cache slots may
  * share one cache, so a cache is destroyed only when it differs from
  * the one seen in the previous slot.
  */
 void
 zio_fini(void)
 {
 	kmem_cache_t *prev_buf = NULL;
 	kmem_cache_t *prev_data = NULL;
 	size_t idx;
 
 	for (idx = 0; idx < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; idx++) {
 		kmem_cache_t *buf = zio_buf_cache[idx];
 		kmem_cache_t *data = zio_data_buf_cache[idx];
 
 		if (buf != prev_buf) {
 			prev_buf = buf;
 			kmem_cache_destroy(buf);
 		}
 		zio_buf_cache[idx] = NULL;
 
 		if (data != prev_data) {
 			prev_data = data;
 			kmem_cache_destroy(data);
 		}
 		zio_data_buf_cache[idx] = NULL;
 	}
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	/* The kstat exists only if kstat_create() succeeded in zio_init(). */
 	if (zio_trim_ksp == NULL)
 		return;
 
 	kstat_delete(zio_trim_ksp);
 	zio_trim_ksp = NULL;
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 /*
  * Allocate a buffer for ZFS metadata.  Metadata buffers show up in a
  * crashdump if the kernel panics, so use this sparingly: inspecting
  * metadata is useful, but excess or transient data should not be kept
  * in-core during a crashdump.
  *
  * The size must map to a valid size class (checked with VERIFY3U).
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	size_t idx = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(idx, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (!zio_use_uma) {
 		int kmflags = KM_SLEEP;
 
 		/* Only this path honors zio_exclude_metadata. */
 		if (zio_exclude_metadata)
 			kmflags |= KM_NODEBUG;
 		return (kmem_alloc(size, kmflags));
 	}
 
 	return (kmem_cache_alloc(zio_buf_cache[idx], KM_PUSHPAGE));
 }
 
 /*
  * Allocate a buffer for file data.  Data buffers are kept out of
  * kernel crashdumps, which limits how much ZFS data (and thus kernel
  * heap) is written to disk when the kernel panics.
  *
  * The size must map to a valid size class (checked with VERIFY3U).
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	size_t idx = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(idx, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (!zio_use_uma)
 		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
 
 	return (kmem_cache_alloc(zio_data_buf_cache[idx], KM_PUSHPAGE));
 }
 
 /*
  * Free a buffer obtained from zio_buf_alloc().  'size' must be the
  * size that was allocated: it selects the cache (or is passed on to
  * kmem_free() when zio_use_uma is 0).
  */
 void
 zio_buf_free(void *buf, size_t size)
 {
 	size_t idx = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(idx, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (!zio_use_uma) {
 		kmem_free(buf, size);
 		return;
 	}
 
 	kmem_cache_free(zio_buf_cache[idx], buf);
 }
 
 /*
  * Free a buffer obtained from zio_data_buf_alloc().  'size' must be
  * the size that was allocated: it selects the cache (or is passed on
  * to kmem_free() when zio_use_uma is 0).
  */
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	size_t idx = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(idx, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (!zio_use_uma) {
 		kmem_free(buf, size);
 		return;
 	}
 
 	kmem_cache_free(zio_data_buf_cache[idx], buf);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 /*
  * Replace the zio's current data/size with 'data'/'size' and remember
  * the previous pair on the transform stack.  zio_pop_transforms()
  * later calls 'transform' (if not NULL) and frees the new buffer when
  * 'bufsize' is nonzero.
  */
 static void
 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *entry;
 
 	entry = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
 	/* Save what is being replaced. */
 	entry->zt_orig_data = zio->io_data;
 	entry->zt_orig_size = zio->io_size;
 	entry->zt_bufsize = bufsize;
 	entry->zt_transform = transform;
 
 	/* Push onto the head of the stack. */
 	entry->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = entry;
 
 	/* Install the new buffer. */
 	zio->io_data = data;
 	zio->io_size = size;
 }
 
 /*
  * Unwind the whole transform stack.  For each entry, newest first: run
  * its callback against the saved original data/size, free the
  * transform buffer if one was recorded, restore the saved data/size on
  * the zio, and free the entry.
  */
 static void
 zio_pop_transforms(zio_t *zio)
 {
 	for (;;) {
 		zio_transform_t *top = zio->io_transform_stack;
 
 		if (top == NULL)
 			break;
 
 		if (top->zt_transform != NULL) {
 			top->zt_transform(zio,
 			    top->zt_orig_data, top->zt_orig_size);
 		}
 
 		/* io_data still points at the transform buffer here. */
 		if (top->zt_bufsize != 0)
 			zio_buf_free(zio->io_data, top->zt_bufsize);
 
 		zio->io_data = top->zt_orig_data;
 		zio->io_size = top->zt_orig_size;
 		zio->io_transform_stack = top->zt_next;
 
 		kmem_free(top, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks and decompression
  * ==========================================================================
  */
 /*
  * Transform callback: for reads, copy the first 'size' bytes of the
  * zio's current buffer into 'data'.  Other I/O types copy nothing.
  */
 static void
 zio_subblock(zio_t *zio, void *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type != ZIO_TYPE_READ)
 		return;
 
 	bcopy(zio->io_data, data, size);
 }
 
 /*
  * Transform callback: decompress the zio's current buffer into 'data'
  * using the compression type stored in the block pointer.  Skipped if
  * the zio already has an error; a decompression failure sets EIO.
  */
 static void
 zio_decompress(zio_t *zio, void *data, uint64_t size)
 {
 	if (zio->io_error != 0)
 		return;
 
 	if (zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 	    zio->io_data, data, zio->io_size, size) != 0) {
 		zio->io_error = SET_ERROR(EIO);
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 /*
  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
  *        continue calling these functions until they return NULL.
  *        Otherwise, the next caller will pick up the list walk in
  *        some indeterminate state.  (Otherwise every caller would
  *        have to pass in a cookie to keep the state represented by
  *        io_walk_link, which gets annoying.)
  */
 /*
  * Return the next parent of cio, or NULL at the end of the list.  The
  * walk position lives in cio->io_walk_link, so a walk must be run to
  * completion before another one starts.
  */
 zio_t *
 zio_walk_parents(zio_t *cio)
 {
 	list_t *parents = &cio->io_parent_list;
 	zio_link_t *link = cio->io_walk_link;
 
 	/* A NULL cursor means "start from the head". */
 	if (link == NULL)
 		link = list_head(parents);
 	else
 		link = list_next(parents, link);
 
 	cio->io_walk_link = link;
 	if (link == NULL)
 		return (NULL);
 
 	ASSERT(link->zl_child == cio);
 	return (link->zl_parent);
 }
 
 /*
  * Return the next child of pio, or NULL at the end of the list.  The
  * walk position lives in pio->io_walk_link, so a walk must be run to
  * completion before another one starts.
  */
 zio_t *
 zio_walk_children(zio_t *pio)
 {
 	list_t *children = &pio->io_child_list;
 	zio_link_t *link = pio->io_walk_link;
 
 	/* A NULL cursor means "start from the head". */
 	if (link == NULL)
 		link = list_head(children);
 	else
 		link = list_next(children, link);
 
 	pio->io_walk_link = link;
 	if (link == NULL)
 		return (NULL);
 
 	ASSERT(link->zl_parent == pio);
 	return (link->zl_child);
 }
 
 /*
  * Return cio's first parent (NULL if it has none) and VERIFY that the
  * parent walk ends immediately after it, i.e. there is no second
  * parent.
  */
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_t *only = zio_walk_parents(cio);
 
 	/* This second call also finishes the walk and resets the cursor. */
 	VERIFY(zio_walk_parents(cio) == NULL);
 
 	return (only);
 }
 
 /*
  * Make cio a child of pio.  A zio_link_t is allocated and inserted on
  * pio's child list and cio's parent list, and pio's outstanding-child
  * counters are raised for every wait state cio has not reached yet.
  */
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT(cio->io_child_type <= pio->io_child_type);
 
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	/* Child lock first, then parent; zio_remove_child() does the same. */
 	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* !io_state[w] is 1 when the child has not reached wait state w. */
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
 	pio->io_child_count++;
 	cio->io_parent_count++;
 
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 }
 
 /*
  * Unlink cio from pio.  zl must be the link that connects them; it is
  * removed from both lists and returned to zio_link_cache.  The
  * io_children[][] counters are not touched here.
  */
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	/* Same lock order as zio_add_child(): child, then parent. */
 	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
 
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
 	pio->io_child_count--;
 	cio->io_parent_count--;
 
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 /*
  * Check whether zio still has children of type 'child' that have not
  * reached wait state 'wait'.  If so, back io_stage up by one stage,
  * record the counter in io_stall so zio_notify_parent() can restart
  * the zio, and return B_TRUE.  Returns B_FALSE when nothing is pending.
  */
 static boolean_t
 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 {
 	uint64_t *pending = &zio->io_children[child][wait];
 	boolean_t must_wait;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 	must_wait = (*pending != 0) ? B_TRUE : B_FALSE;
 	if (must_wait) {
 		zio->io_stage >>= 1;
 		zio->io_stall = pending;
 	}
 	mutex_exit(&zio->io_lock);
 
 	return (must_wait);
 }
 
 /*
  * Tell parent pio that child zio has reached wait state 'wait'.  The
  * child's error is folded into the parent's per-child-type error
  * (unless ZIO_FLAG_DONT_PROPAGATE is set), the reexecute bits are
  * merged, and the outstanding-child counter is decremented.  If the
  * parent is stalled on exactly this counter and it has dropped to
  * zero, the parent is resumed through zio_execute().
  */
 static void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		pio->io_stall = NULL;
 		/* The lock is dropped before the parent is re-entered. */
 		mutex_exit(&pio->io_lock);
 		zio_execute(pio);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 /*
  * Adopt the error recorded for child type 'c' as the zio's own error,
  * but only when the zio does not already have one.
  */
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	int child_err = zio->io_child_error[c];
 
 	if (zio->io_error == 0 && child_err != 0)
 		zio->io_error = child_err;
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 /*
  * Common constructor behind every zio_*() creation routine in this
  * file.  Allocates a zeroed zio_t from zio_cache, initializes its lock,
  * condition variable and parent/child lists, derives the child type
  * from vd and flags, records the caller's arguments (both as current
  * and as "orig" values), and links the new zio under pio when pio is
  * not NULL.
  *
  * When bp is not NULL it is copied into io_bp_copy and io_bp_orig.
  * io_bp points at the caller's bp only for non-DDT-child writes; in
  * every other case it points at the private copy.
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
     vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
     enum zio_stage stage, enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	/*
 	 * NOTE(review): the left operand below is the boolean result of
 	 * (type == ZIO_TYPE_FREE || size), i.e. 0 or 1, so this check can
 	 * never fire.  The intent looks like
 	 * "type == ZIO_TYPE_FREE || size <= SPA_MAXBLOCKSIZE"; confirm
 	 * that no non-FREE caller passes a larger size before tightening.
 	 */
 	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	bzero(zio, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 
 	/* A vdev argument takes precedence over the child-type flags. */
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		zio->io_bp = (blkptr_t *)bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT)
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_data = zio->io_data = data;
 	zio->io_orig_size = zio->io_size = size;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 
 	/* A zio created past READY/DONE counts as already in that state. */
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		/* Inherit the logical zio unless this zio is its own. */
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child(pio, zio);
 	}
 
 	return (zio);
 }
 
 /*
  * Release everything zio_create() set up: the two link lists, the lock
  * and condition variable, and finally the zio_t itself.
  */
 static void
 zio_destroy(zio_t *zio)
 {
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
 }
 
 /*
  * Create a ZIO_TYPE_NULL zio: no block pointer, no data, zero size.
  * It runs the interlock pipeline at ZIO_PRIORITY_NOW and is linked
  * under pio when pio is not NULL.
  */
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, enum zio_flag flags)
 {
 	return (zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE));
 }
 
 /*
  * Create a parentless null zio with no vdev, for use as the root of a
  * tree of child I/Os.
  */
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	zio_t *root = zio_null(NULL, spa, NULL, done, private, flags);
 
 	return (root);
 }
 
 /*
  * Sanity-check a block pointer before it is used for a read.  Every
  * failed check is reported through zfs_panic_recover(); the function
  * itself returns nothing.
  *
  * Generic checks cover the object type, checksum and compression
  * function numbers, logical/physical sizes and, for embedded block
  * pointers, the embedded type.  Pool-specific checks then validate
  * each DVA against the pool's top-level vdevs.
  */
 void
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 {
 	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/*
 		 * NUM_BP_EMBEDDED_TYPES is a count, so the valid types are
 		 * 0 .. NUM_BP_EMBEDDED_TYPES - 1; a value equal to the
 		 * count is already out of range.
 		 */
 		if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
 			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the blk_birth and
 	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
 	 * that are in the log) to be arbitrarily large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (vdevid >= spa->spa_root_vdev->vdev_children) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (vd == NULL) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_hole_ops) {
 			zfs_panic_recover("blkptr at %p DVA %u has hole "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 		/* For gang blocks, check the gang header's allocated size. */
 		if (BP_IS_GANG(bp))
 			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 		if (offset + asize > vd->vdev_asize) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 }
 
 /*
  * Create a logical read of the block described by bp into 'data'.  The
  * block pointer is verified first.  The zio's txg is the block's
  * physical birth txg, and DDT children get the DDT-child read pipeline
  * instead of the regular one.
  */
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	enum zio_stage pipeline;
 
 	if (flags & ZIO_FLAG_DDT_CHILD)
 		pipeline = ZIO_DDT_CHILD_READ_PIPELINE;
 	else
 		pipeline = ZIO_READ_PIPELINE;
 
 	zfs_blkptr_verify(spa, bp);
 
 	return (zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 	    data, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, pipeline));
 }
 
 /*
  * Create a logical write of 'data' in transaction group txg, to be
  * described by bp.  zp supplies the write properties and is copied
  * into the zio.  'ready', 'physdone' and 'done' are stored as the zio's
  * io_ready, io_physdone and io_done callbacks.  DDT children get the
  * DDT-child write pipeline instead of the regular one.
  */
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
     void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	/* Reject out-of-range write properties in debug builds. */
 	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
 	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
 	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
 	/*
 	 * Data can be NULL if we are going to call zio_write_override() to
 	 * provide the already-allocated BP.  But we may need the data to
 	 * verify a dedup hit (if requested).  In this case, don't try to
 	 * dedup (just take the already-allocated BP verbatim).
 	 */
 	if (data == NULL && zio->io_prop.zp_dedup_verify) {
 		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 	}
 
 	return (zio);
 }
 
 /*
  * Create a write zio that uses the rewrite pipeline for the block
  * described by bp.  Unlike zio_write(), no write properties or
  * ready/physdone callbacks are supplied.
  */
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 {
 	return (zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE));
 }
 
 /*
  * Attach an already-allocated block pointer to a not-yet-started
  * logical write in the syncing txg.  The write properties are reset to
  * the values that were in effect when the bp was first written by
  * dmu_sync(); nopwrite and dedup are mutually exclusive, so requesting
  * nopwrite turns dedup off.
  */
 void
 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 
 	if (nopwrite)
 		zio->io_prop.zp_dedup = B_FALSE;
 
 	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
 
 /*
  * Free the block described by bp in transaction group txg.  Embedded
  * block pointers are ignored.  Depending on the conditions below the
  * free is either queued on the per-txg free bplist or issued and
  * waited for right here.
  */
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 
 	/*
 	 * The check for EMBEDDED is a performance optimization.  We
 	 * process the free here (by ignoring it) rather than
 	 * putting it on the list and then processing it in zio_free_sync().
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 	metaslab_check_free(spa, bp);
 
 	/*
 	 * Frees that are for the currently-syncing txg, are not going to be
 	 * deferred, and which will not need to do a read (i.e. not GANG or
 	 * DEDUP), can be processed immediately.  Otherwise, put them on the
 	 * in-memory list for later processing.
 	 *
 	 * With zfs_trim_enabled set, every free takes the list path.
 	 */
 	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 	} else {
 		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
 		    BP_GET_PSIZE(bp), 0)));
 	}
 }
 
 /*
  * Build (but do not issue) a ZIO_TYPE_FREE zio for bp.  Must be called
  * for the currently syncing txg and before the deferred-free sync pass
  * (both asserted).  An embedded bp yields a null zio instead.  The
  * caller is responsible for running the returned zio.
  */
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     uint64_t size, enum zio_flag flags)
 {
 	zio_t *zio;
 	/* Despite its name, this holds the pipeline mask for the new zio. */
 	enum zio_stage stage = ZIO_FREE_PIPELINE;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 
 	/* With TRIM enabled the free also runs the vdev I/O stages. */
 	if (zfs_trim_enabled)
 		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
 		    ZIO_STAGE_VDEV_IO_ASSESS;
 	/*
 	 * GANG and DEDUP blocks can induce a read (for the gang block header,
 	 * or the DDT), so issue them asynchronously so that this thread is
 	 * not tied up.
 	 */
 	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 		stage |= ZIO_STAGE_ISSUE_ASYNC;
 
 	flags |= ZIO_FLAG_DONT_QUEUE;
 
 	zio = zio_create(pio, spa, txg, bp, NULL, size,
 	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 
 	return (zio);
 }
 
 /*
  * Build a zio that claims block 'bp' in transaction group 'txg'.
  * An embedded bp yields a null zio.  'done' and 'private' are passed
  * through to zio_create() as the completion callback and its argument.
  */
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	dprintf_bp(bp, "claiming in txg %llu", txg);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * Claiming means allocating one specific block.  The intent log
 	 * needs this for immediate writes: their data is committed, but
 	 * the txg they were written in was *not*.  When a pool is opened
 	 * after an unclean shutdown, the intent log therefore claims every
 	 * block holding immediate write data so that the SPA knows those
 	 * blocks are in use.
 	 *
 	 * Every claim *must* be resolved in the first txg -- before the SPA
 	 * begins allocating blocks -- so that nothing gets allocated twice.
 	 * A txg of 0 only verifies that the block is claimable.
 	 */
 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 	ASSERT(txg == spa_first_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
 
 	return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE));
 }
 
 /*
  * Build an ioctl zio carrying 'cmd' for vdev 'vd'.
  *
  * A vdev without children gets a single ZIO_TYPE_IOCTL zio.  A vdev with
  * children gets a null zio under which one ioctl zio per child is created
  * recursively and started with zio_nowait(); the null zio is returned.
  */
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags)
 {
 	zio_t *zio;
 
 	if (vd->vdev_children != 0) {
 		/* Interior vdev: fan the ioctl out to every child. */
 		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 
 		for (int child = 0; child < vd->vdev_children; child++) {
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[child],
 			    cmd, offset, size, done, private, priority,
 			    flags));
 		}
 
 		return (zio);
 	}
 
 	/* No children: issue the ioctl against this vdev itself. */
 	zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
 	    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
 	    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 	zio->io_cmd = cmd;
 
 	return (zio);
 }
 
 /*
  * Build a physical read zio for 'size' bytes at 'offset' on the childless
  * vdev 'vd'.  ZIO_FLAG_PHYSICAL is added to 'flags' and 'checksum' is
  * stored in the zio's properties.
  *
  * When 'labels' is set, the range must lie within the first
  * VDEV_LABEL_START_SIZE bytes or start at or beyond
  * vdev_psize - VDEV_LABEL_END_SIZE.
  */
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *rzio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	flags |= ZIO_FLAG_PHYSICAL;
 
 	rzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size,
 	    done, private, ZIO_TYPE_READ, priority, flags, vd, offset,
 	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 	rzio->io_prop.zp_checksum = checksum;
 
 	return (rzio);
 }
 
 /*
  * Build a physical write zio for 'size' bytes at 'offset' on the childless
  * vdev 'vd'.  ZIO_FLAG_PHYSICAL is added to 'flags' and 'checksum' is
  * stored in the zio's properties.  The 'labels' range restriction is the
  * same as the one asserted in zio_read_phys().
  *
  * For checksums flagged ZCHECKSUM_FLAG_EMBEDDED the data is copied into a
  * private buffer that is pushed as a transform.
  */
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *wzio;
 	void *copy;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	flags |= ZIO_FLAG_PHYSICAL;
 
 	wzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size,
 	    done, private, ZIO_TYPE_WRITE, priority, flags, vd, offset,
 	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 	wzio->io_prop.zp_checksum = checksum;
 
 	if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED))
 		return (wzio);
 
 	/*
 	 * zec checksums are necessarily destructive -- they overwrite the
 	 * end of the write buffer with the verifier/checksum.  Since the
 	 * same data may be written to multiple places in parallel, work
 	 * on a local copy.
 	 */
 	copy = zio_buf_alloc(size);
 	bcopy(data, copy, size);
 	zio_push_transform(wzio, copy, size, size, NULL);
 
 	return (wzio);
 }
 
 /*
  * Create a child I/O to do some work for us.
  *
  * The child targets vdev 'vd', which must be a direct child of the parent
  * zio's vdev (or of the spa's root vdev when the parent has no vdev).  It
  * inherits the parent's spa, txg and bookmark, and its initial stage is
  * ZIO_STAGE_VDEV_IO_START >> 1.
  *
  * Side effects on the parent: for reads with a non-NULL 'bp' the checksum
  * verification stage is moved from the parent's pipeline to the child's.
  * For leaf vdevs with a logical ancestor, that ancestor's
  * io_phys_children count is incremented.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 	void *data, uint64_t size, int type, zio_priority_t priority,
 	enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	ASSERT(vd->vdev_parent ==
 	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * If we have the bp, then the child should perform the
 		 * checksum and the parent need not.  This pushes error
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
 	/* Not all IO types require vdev io done stage e.g. free */
 	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
 		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
 
 	/*
 	 * For a childless vdev the offset is shifted by
 	 * VDEV_LABEL_START_SIZE (presumably to step over the label area
 	 * at the front of the device -- confirm against the vdev label
 	 * layout).
 	 */
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 
 	/*
 	 * If we've decided to do a repair, the write is not speculative --
 	 * even if the original read was.
 	 */
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 
 	/* The child reports physical completion through the parent's hook. */
 	zio->io_physdone = pio->io_physdone;
 	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 		zio->io_logical->io_phys_children++;
 
 	return (zio);
 }
 
 /*
  * Build a parentless zio against the leaf vdev 'vd'.  CANFAIL, DONT_RETRY
  * and DELEGATED are added to the caller's flags, and the zio starts at the
  * stage ZIO_STAGE_VDEV_IO_START >> 1 of the vdev child pipeline.
  */
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
     int type, zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *private)
 {
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	flags |= ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED;
 
 	return (zio_create(NULL, vd->vdev_spa, 0, NULL,
 	    data, size, done, private, type, priority, flags,
 	    vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE));
 }
 
 /*
  * Start, without waiting for it, a DKIOCFLUSHWRITECACHE ioctl on vdev 'vd'
  * as a child of 'zio'.  The ioctl is allowed to fail, is not retried, and
  * carries ZIO_FLAG_DONT_PROPAGATE.
  */
 void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
 	    NULL, NULL, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
 /*
  * Build a ZIO_TYPE_FREE zio at ZIO_PRIORITY_TRIM covering 'size' bytes at
  * 'offset' on the leaf vdev 'vd', as a child of 'zio'.  It runs the
  * ZIO_FREE_PHYS_PIPELINE and is returned unstarted.
  */
 zio_t *
 zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	zio_t *trim;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	trim = zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM,
 	    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
 	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE);
 
 	return (trim);
 }
 
 /*
  * Reduce the size of a zio that has not started executing to 'size' bytes.
  * The zio's bp must be uncompressed.  Nothing is changed when the bp is
  * a RAID-Z bp.
  */
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
 	ASSERT(zio->io_executor == NULL);
 	ASSERT(zio->io_orig_size == zio->io_size);
 	ASSERT(size <= zio->io_size);
 
 	/*
 	 * BP_IS_RAIDZ() assumes no compression, hence this assertion
 	 * ahead of the check below.
 	 */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 
 	/*
 	 * raidz is left alone because reconstruction has problems when
 	 * reading back less than the block size.
 	 */
 	if (BP_IS_RAIDZ(zio->io_bp))
 		return;
 
 	zio->io_size = size;
 	zio->io_orig_size = size;
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 /*
  * Pipeline stage that prepares a read of the block described by io_bp.
  *
  *  - A compressed block read by a logical zio without ZIO_FLAG_RAW gets a
  *    psize-sized buffer pushed as a transform with zio_decompress.
  *  - An embedded bp of type BP_EMBEDDED_TYPE_DATA is decoded straight into
  *    io_data and the pipeline is reduced to ZIO_INTERLOCK_PIPELINE.
  *  - Level-0 non-metadata blocks and DDT ZAP blocks are flagged
  *    ZIO_FLAG_DONT_CACHE.
  *  - Logical reads of dedup'd blocks switch to ZIO_DDT_READ_PIPELINE.
  *
  * Always returns ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
 		/* Embedded bps keep their physical size in a separate field. */
 		uint64_t psize =
 		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(psize);
 
 		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, zio->io_data);
 	} else {
 		/* Only embedded bps of type DATA are expected here. */
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage that prepares io_bp for a write.
  *
  * Returns ZIO_PIPELINE_STOP while gang or logical children still have to
  * reach the ready state, and ZIO_PIPELINE_CONTINUE otherwise.  Zios that
  * are not allocating pass straight through.  For allocating writes the
  * stage proceeds in this order:
  *
  *  1. Honor io_bp_override, if one was supplied.
  *  2. When rewriting a block born in this txg, turn compression off once
  *     the sync pass reaches zfs_sync_pass_dont_compress.
  *  3. Compress the data; the result is either embedded into the bp,
  *     pushed as a transform, or discarded in favor of the raw data.
  *  4. Choose between the rewrite pipeline and a fresh write pipeline.
  *  5. Fill in the bp fields and add the DDT / nopwrite stages.
  */
 static int
 zio_write_bp_init(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_size;
 	uint64_t psize = lsize;
 	int pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
 		return (ZIO_PIPELINE_STOP);
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
 		ASSERT(bp->blk_birth != zio->io_txg);
 		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
 		/* Adopt the override bp and cut the pipeline down. */
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (BP_IS_EMBEDDED(bp))
 			return (ZIO_PIPELINE_CONTINUE);
 
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
 		 * has already occurred.
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		ASSERT(!zp->zp_nopwrite);
 
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 		/*
 		 * The override's checksum differs from the one requested:
 		 * drop the override and fall through to a regular write.
 		 */
 		zio->io_bp_override = NULL;
 		BP_ZERO(bp);
 	}
 
 	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
 	if (compress != ZIO_COMPRESS_OFF) {
 		void *cbuf = zio_buf_alloc(lsize);
 		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
 		if (psize == 0 || psize == lsize) {
 			/*
 			 * A result of 0 or lsize means the compressed
 			 * buffer is not used; psize == 0 is handled
 			 * further below.
 			 */
 			compress = ZIO_COMPRESS_OFF;
 			zio_buf_free(cbuf, lsize);
 		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			/*
 			 * The compressed data is small enough to be stored
 			 * inside the block pointer itself.
 			 */
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			zio_buf_free(cbuf, lsize);
 			bp->blk_birth = zio->io_txg;
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (ZIO_PIPELINE_CONTINUE);
 		} else {
 			/*
 			 * Round the compressed size up to the ashift
 			 * of the smallest-ashift device, and zero the tail.
 			 * This ensures that the compressed size of the BP
 			 * (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 			size_t rounded = (size_t)P2ROUNDUP(psize,
 			    1ULL << spa->spa_min_ashift);
 			if (rounded >= lsize) {
 				/* Padding ate the savings: write raw data. */
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
 				psize = lsize;
 			} else {
 				bzero((char *)cbuf + psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cbuf,
 				    psize, lsize, NULL);
 			}
 		}
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		ASSERT(psize != 0);
 		/* Keep whatever gang stages the pipeline already had. */
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
 		/*
 		 * Nothing will be written.  If the original bp had a birth
 		 * txg and the hole_birth feature is active, record size,
 		 * type, level and birth txg in the bp.
 		 */
 		if (zio->io_bp_orig.blk_birth != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage that prepares a free: a logical zio whose bp is dedup'd
  * is switched to ZIO_DDT_FREE_PIPELINE.  Always returns
  * ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_free_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL && BP_GET_DEDUP(bp))
 		zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 /*
  * Queue zio_execute(zio) on one of the spa's zio taskqs.
  *
  * 'q' selects the issue or the interrupt taskq; the taskq type defaults to
  * the zio's io_type but may be redirected below.  When 'cutinline' is set
  * the entry is dispatched with TQ_FRONT.
  */
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 	int flags = (cutinline ? TQ_FRONT : 0);
 
 	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
 	 * interrupt threads may all be blocked waiting for the config lock.
 	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
 	 */
 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * If this is a high priority I/O, then use the high priority taskq if
 	 * available.
 	 */
 	/* The high priority variant is the taskq at index q + 1. */
 	if (zio->io_priority == ZIO_PRIORITY_NOW &&
 	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	/* The taskq entry layout differs between the two build flavors. */
 #if defined(illumos) || !defined(_KERNEL)
 	ASSERT(zio->io_tqent.tqent_next == NULL);
 #else
 	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
 #endif
 	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
 	    flags, &zio->io_tqent);
 }
 
 /*
  * Report whether the thread recorded in zio->io_executor belongs to any of
  * the spa's zio taskqs of kind 'q', looking across every zio type.
  */
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	spa_t *spa = zio->io_spa;
 	kthread_t *thr = zio->io_executor;
 
 	for (zio_type_t type = 0; type < ZIO_TYPES; type++) {
 		spa_taskqs_t *set = &spa->spa_zio_taskq[type][q];
 
 		for (uint_t n = 0; n < set->stqs_count; n++) {
 			if (taskq_member(set->stqs_taskq[n], thr))
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Pipeline stage that hands the zio to the issue taskq (without cutting
  * in line) and stops pipeline execution in the current thread.
  */
 static int
 zio_issue_async(zio_t *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (ZIO_PIPELINE_STOP);
 }
 
 /*
  * Dispatch the zio to the interrupt taskq (without cutting in line).
  */
 void
 zio_interrupt(zio_t *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
  * Deliver the zio to zio_interrupt(), optionally after a delay.
  *
  * In kernel builds, a non-zero io_target_timestamp that still lies in the
  * future postpones the zio_interrupt() call via timeout_generic(); one
  * that has already passed results in an immediate call.  With no target
  * timestamp, and always in userspace builds, zio_interrupt() is called
  * directly.
  */
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * The timeout_generic() function isn't defined in userspace, so
 	 * rather than trying to implement the function, the zio delay
 	 * functionality has been disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * If io_target_timestamp is zero, then no delay has been registered
 	 * for this IO, thus jump to the end of this function and "skip" the
 	 * delay; issuing it directly to the zio layer.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * This IO has already taken longer than the target
 			 * delay to complete, so we don't want to delay it
 			 * any longer; we "miss" the delay and issue it
 			 * directly to the zio layer. This is likely due to
 			 * the target latency being set to a value less than
 			 * the underlying hardware can satisfy (e.g. delay
 			 * set to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 		} else {
 			/* Time still remaining until the target timestamp. */
 			hrtime_t diff = zio->io_target_timestamp - now;
 
 			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 			    hrtime_t, now, hrtime_t, diff);
 
 			(void) timeout_generic(CALLOUT_NORMAL,
 			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
 		}
 
 		/* Both kernel branches are done; skip the fallthrough path. */
 		return;
 	}
 #endif
 
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  *
  *	(1) the I/O completes
  *	(2) the pipeline stalls waiting for dependent child I/Os
  *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
  *	(4) the I/O is delegated by vdev-level caching or aggregation
  *	(5) the I/O is deferred due to vdev-level queueing
  *	(6) the I/O is handed off to another thread.
  *
  * In all cases, the pipeline stops whenever there's no CPU work; it never
  * burns a thread in cv_wait().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
 /*
  * Forward declaration of the table of stage handlers.  zio_execute()
  * indexes it by the bit position of the stage being run.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 void
 zio_execute(zio_t *zio)
 {
 	/* Record which thread is driving this zio. */
 	zio->io_executor = curthread;
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 		int rv;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		/*
 		 * Advance to the next stage bit that is present in this
 		 * zio's pipeline mask.
 		 */
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			/*
 			 * io_stage has not been advanced yet, so the issue
 			 * thread will pick this same stage again.
 			 */
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		zio->io_stage = stage;
 		/* stage is a single bit; its bit index selects the handler. */
 		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
 
 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
 	}
 }
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 /*
  * Run a zio that is still in the OPEN stage and block until it has no
  * executor any more, then destroy it.
  *
  * Returns the zio's io_error.  The zio must not be used after this call.
  */
 int
 zio_wait(zio_t *zio)
 {
 	int err;
 
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_executor == NULL);
 
 	/* Register the calling thread as the waiter before starting. */
 	zio->io_waiter = curthread;
 	zio_execute(zio);
 
 	/* Sleep on io_cv until io_executor has been cleared. */
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL) {
 		cv_wait(&zio->io_cv, &zio->io_lock);
 	}
 	mutex_exit(&zio->io_lock);
 
 	/* Pick up the result before the zio goes away. */
 	err = zio->io_error;
 	zio_destroy(zio);
 
 	return (err);
 }
 
 /*
  * Start executing a zio without waiting for it to complete.
  */
 void
 zio_nowait(zio_t *zio)
 {
 	ASSERT(zio->io_executor == NULL);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		if (zio_unique_parent(zio) == NULL) {
 			/*
 			 * A logical async I/O that has no parent to wait
 			 * for it is attached to the spa_async_root_zio
 			 * "Godfather" I/O, which will ensure it completes
 			 * prior to unloading the pool.
 			 */
 			zio_add_child(
 			    zio->io_spa->spa_async_zio_root[CPU_SEQID], zio);
 		}
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * ==========================================================================
  * Reexecute or suspend/resume failed I/O
  * ==========================================================================
  */
 
 /*
  * Restart a logical zio from the beginning.
  *
  * 'pio' is reset to its original flags, stage and pipeline; its error,
  * wait state and child errors are cleared; and ZIO_FLAG_REEXECUTED is set.
  * Each child is then re-counted in pio->io_children and reexecuted
  * recursively.  Finally pio itself is executed, unless it is a godfather
  * zio.
  */
 static void
 zio_reexecute(zio_t *pio)
 {
 	zio_t *cio, *cio_next;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	/* An allocating zio starts over with an empty bp. */
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
 		/* Fetch the successor before 'cio' is reexecuted. */
 		cio_next = zio_walk_children(pio);
 		mutex_enter(&pio->io_lock);
 		/* Count the child as outstanding for every wait type. */
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 	}
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on him.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
 		zio_execute(pio);
 }
 
 /*
  * Put the pool into the suspended state after an I/O failure.
  *
  * Panics when the pool's failure mode is ZIO_FAILURE_MODE_PANIC.
  * Otherwise an I/O failure ereport is posted, the suspend root zio is
  * created if it does not exist yet, and spa_suspended is set.  A non-NULL
  * 'zio' is attached as a child of the suspend root.
  */
 void
 zio_suspend(spa_t *spa, zio_t *zio)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 
 	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	/* Create the godfather root for suspended zios on first use. */
 	if (spa->spa_suspend_zio_root == NULL)
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 
 	spa->spa_suspended = B_TRUE;
 
 	if (zio != NULL) {
 		/*
 		 * Only a parentless logical zio that has reached the DONE
 		 * stage may be parked under the suspend root.
 		 */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * Clear the pool's suspended state, wake every thread sleeping on
  * spa_suspend_cv, and reexecute all i/o that was parked under the suspend
  * root zio.
  *
  * Returns 0 when there was no suspend root, otherwise the result of
  * waiting on it.
  */
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *root;
 
 	/* Detach the suspend root while holding the suspend lock. */
 	mutex_enter(&spa->spa_suspend_lock);
 	spa->spa_suspended = B_FALSE;
 	cv_broadcast(&spa->spa_suspend_cv);
 	root = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (root != NULL) {
 		/* Reexecute all previously suspended i/o. */
 		zio_reexecute(root);
 		return (zio_wait(root));
 	}
 
 	return (0);
 }
 
 /*
  * Block the calling thread for as long as the pool is suspended.
  */
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 	/* Re-check after every wakeup; zio_resume() broadcasts on this cv. */
 	while (spa_suspended(spa))
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 /*
  * Gang issue callback for reads.
  *
  * A gang member (gn == NULL) gets a read zio for its data.  For a gang
  * header (gn != NULL) no I/O is created and 'pio' itself is returned.
  */
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
 	if (gn == NULL) {
 		return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
 		    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark));
 	}
 
 	return (pio);
 }
 
 /*
  * Gang issue callback for rewrites.
  *
  * A gang member (gn == NULL) has its data rewritten.  A gang header has
  * the header block gn->gn_gbh rewritten instead.  The rewrite zio is
  * returned unstarted.
  */
 zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
 	zio_t *rw;
 
 	if (gn == NULL) {
 		/* Gang member: rewrite the data itself. */
 		return (zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
 	}
 
 	/* Gang header: rewrite the header block. */
 	rw = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 	    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	/*
 	 * The pipeline computes a new gang block header checksum for each
 	 * header we rewrite, but nobody computes a new data checksum, so
 	 * that is done here.  The gang leader is the one exception: its
 	 * data checksum was already computed by the pipeline, because that
 	 * stage precedes gang assembly.  (Presently, nothing actually uses
 	 * interior data checksums; this is just good hygiene.)
 	 */
 	if (gn != pio->io_gang_leader->io_gang_tree) {
 		zio_checksum_compute(rw, BP_GET_CHECKSUM(bp),
 		    data, BP_GET_PSIZE(bp));
 	}
 
 	/*
 	 * When damage is being induced for testing purposes, the GBH is
 	 * left alone so that the damage can be detected.
 	 */
 	if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 		rw->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 
 	return (rw);
 }
 
 /*
  * Gang issue callback for frees: builds a zio_free_sync() zio for 'bp' in
  * pio's txg.  A gang bp is freed with size SPA_GANGBLOCKSIZE, any other bp
  * with its physical size.  'gn' and 'data' are unused.
  */
 /* ARGSUSED */
 zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
 	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
 	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /*
  * Gang issue callback for claims: builds a zio_claim() zio for 'bp' in
  * pio's txg with no completion callback.  'gn' and 'data' are unused.
  */
 /* ARGSUSED */
 zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /*
  * Per-bp gang callbacks, indexed by the gang leader's io_type (see
  * zio_gang_tree_issue()).  NOTE(review): the slot order presumably follows
  * the zio type enumeration (null, read, write, free, claim, ioctl) --
  * confirm against the definition of zio_type_t.  NULL slots have no gang
  * callback.
  */
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 /* Needed early: zio_gang_tree_assemble() passes it as a done callback. */
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 /*
  * Allocate a zeroed gang node together with a SPA_GANGBLOCKSIZE buffer for
  * its gang header, and store it in *gnpp, which must be NULL on entry.
  * Returns the new node.
  */
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node;
 
 	ASSERT(*gnpp == NULL);
 
 	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
 	node->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 
 	*gnpp = node;
 	return (node);
 }
 
 /*
  * Release a single gang node and its gang header buffer, and reset *gnpp
  * to NULL.  All of the node's child slots must already be empty.
  */
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = *gnpp;
 
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 		ASSERT(node->gn_child[slot] == NULL);
 	}
 
 	zio_buf_free(node->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(node, sizeof (*node));
 
 	*gnpp = NULL;
 }
 
 /*
  * Recursively release the gang tree rooted at *gnpp, children first.
  * An empty tree (*gnpp == NULL) is a no-op.
  */
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *root = *gnpp;
 
 	if (root != NULL) {
 		for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 			zio_gang_tree_free(&root->gn_child[slot]);
 		}
 
 		zio_gang_node_free(gnpp);
 	}
 }
 
 /*
  * Allocate a gang node at *gnpp and start an asynchronous read of the gang
  * header that 'bp' points to into that node.  'gio' must be the gang
  * leader.  zio_gang_tree_assemble_done() is installed as the read's done
  * callback, with the new node as its private argument.
  */
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = zio_gang_node_alloc(gnpp);
 	zio_t *hdr_read;
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	hdr_read = zio_read(gio, gio->io_spa, bp, node->gn_gbh,
 	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, node,
 	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark);
 
 	zio_nowait(hdr_read);
 }
 
 /*
  * Completion callback for a gang header read.  On success, byteswaps the
  * header if the block pointer calls for it and then starts assembling a
  * child node for every block pointer in the header that is itself a gang
  * block.  On error, does nothing.
  */
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 	int g;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(zio->io_child_count == 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(zio->io_data, zio->io_size);
 
 	ASSERT(zio->io_data == gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 
 		/* Only nested gang blocks need a node of their own. */
 		if (BP_IS_GANG(gbp))
 			zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 /*
  * Issue the gang leader's I/O type against 'bp' and, when 'gn' is non-NULL,
  * recursively against every non-hole block pointer in gn's gang header.
  *
  * pio:  parent for the zio created here.
  * gn:   gang node describing 'bp', or NULL when 'bp' is not a gang block.
  * bp:   block pointer to operate on.
  * data: buffer position for this subtree; advanced by the physical size of
  *       each constituent as the header's block pointers are walked.
  */
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
 			/* The zio just created becomes the children's parent. */
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
 			data = (char *)data + BP_GET_PSIZE(gbp);
 		}
 	}
 
 	/* At the root, the walk must have consumed exactly io_size bytes. */
 	if (gn == gio->io_gang_tree && gio->io_data != NULL)
 		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
 
 	/* If the issue function handed back pio itself, it is not ours to start. */
 	if (zio != pio)
 		zio_nowait(zio);
 }
 
 /*
  * Pipeline stage: make 'zio' its own gang leader and start reading the gang
  * tree for its block pointer into zio->io_gang_tree.  Always returns
  * ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 
 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: once all gang children are done, either issue the I/O
  * across the assembled gang tree or, if any gang child failed, throw the
  * tree away.  In both cases the remaining pipeline is reduced to
  * ZIO_INTERLOCK_PIPELINE.
  */
 static int
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] != 0) {
 		/* A gang child reported an error: discard the tree. */
 		zio_gang_tree_free(&zio->io_gang_tree);
 	} else {
 		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
 	}
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Ready callback for a gang member write.  Adds the allocated size of each
  * of the child's DVAs to the corresponding DVA of the parent's block
  * pointer, under the parent's io_lock.  Nothing is done when the child's
  * block pointer is a hole.
  */
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	zio_t *gio = zio->io_gang_leader;
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	int d;
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		uint64_t asize;
 
 		ASSERT(DVA_GET_GANG(&pdva[d]));
 
 		/* parent asize += child asize */
 		asize = DVA_GET_ASIZE(&pdva[d]);
 		asize += DVA_GET_ASIZE(&cdva[d]);
 		DVA_SET_ASIZE(&pdva[d], asize);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Write pio's data as a gang block: allocate a gang header, then split the
  * payload into child writes whose block pointers live in that header.
  *
  * If the header allocation fails, the error is stored in pio->io_error.
  * Otherwise pio's pipeline is reduced to ZIO_INTERLOCK_PIPELINE so that it
  * only waits for the header zio.  Returns ZIO_PIPELINE_CONTINUE either way.
  */
 static int
 zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
 	/* The header gets one more copy than the data, capped by the pool. */
 	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
 	zio_prop_t zp;
 	int error;
 
 	/* The leader's own bp is passed as a hint only for nested gang blocks. */
 	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
 	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
 	if (error) {
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/*
 	 * The leader roots the gang tree; a nested gang write finds its slot
 	 * in io_private (set to &gn->gn_child[g] by the zio_write() below).
 	 */
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	bzero(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
 	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	/*
 	 * Create and nowait the gang children.
 	 */
 	for (int g = 0; resid != 0; resid -= lsize, g++) {
 		/*
 		 * Divide what is left evenly over the remaining header slots,
 		 * rounded up to a multiple of SPA_MINBLOCKSIZE.
 		 */
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
 		/* Children inherit checksum and copies; everything else is off. */
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 
 		/* (pio->io_size - resid) is the offset already handed out. */
 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
 		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark));
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	zio_nowait(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static int
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * A nopwrite is only possible when there is a previous block, the
 	 * checksum in use is flagged for nopwrite, and the old and new block
 	 * pointers agree on checksum, compression, dedup and copy count.
 	 * Otherwise carry on with the pipeline, which allocates a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	if (!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	if (BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	/* Different contents: this has to be a real write. */
 	if (!(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	/*
 	 * The checksums match, so reuse the original bp and reset the
 	 * pipeline to avoid allocating a new bp and issuing any I/O.
 	 */
 	ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE);
 	ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 	ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 	ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 	ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
 	    sizeof (uint64_t)) == 0);
 
 	*bp = *bp_orig;
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	zio->io_flags |= ZIO_FLAG_NOPWRITE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 /*
  * Completion callback for a repair read of an alternate dedup copy.  On
  * success the matching phys entry is cleared, and the buffer becomes the
  * entry's repair data if none has been recorded yet.  In every other case
  * the buffer is freed.  Runs under the parent's io_lock.
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp;
 	zio_t *pio = zio_unique_parent(zio);
 	boolean_t buf_kept = B_FALSE;
 
 	mutex_enter(&pio->io_lock);
 
 	ddp = ddt_phys_select(dde, bp);
 	if (zio->io_error == 0) {
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
 
 		if (dde->dde_repair_data == NULL) {
 			dde->dde_repair_data = zio->io_data;
 			buf_kept = B_TRUE;
 		}
 	}
 
 	if (!buf_kept)
 		zio_buf_free(zio->io_data, zio->io_size);
 
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Pipeline stage: start the read of a dedup block.
  *
  * Normally this issues a single child read of 'bp' into zio->io_data.  When
  * a DDT child error is already recorded on the zio, it instead begins a
  * repair: the DDT entry is stashed in io_vsd and a read is issued for every
  * other phys copy that has a non-zero birth.
  */
 static int
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 		ddt_phys_t *ddp = dde->dde_phys;
 		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
 		blkptr_t blk;
 
 		/* Hand the entry to zio_ddt_read_done() through io_vsd. */
 		ASSERT(zio->io_vsd == NULL);
 		zio->io_vsd = dde;
 
 		/* No phys entry matches this bp: nothing to read. */
 		if (ddp_self == NULL)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			/* Skip unused slots and the copy that already failed. */
 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
 				continue;
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
 			    &blk);
 			/* Each alternate copy is read into its own buffer. */
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    zio_buf_alloc(zio->io_size), zio->io_size,
 			    zio_ddt_child_read_done, dde, zio->io_priority,
 			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
 			    &zio->io_bookmark));
 		}
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	zio_nowait(zio_read(zio, zio->io_spa, bp,
 	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: finish the read of a dedup block once all DDT children
  * are done.
  *
  * If a DDT child failed and no repair has been started yet (io_vsd is
  * NULL), the zio is sent back through zio_ddt_read_start() on the issue
  * taskq.  If a repair has run, any repair data found is copied into
  * io_data, the DDT child error is cleared, and the repair is closed out.
  */
 static int
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		/* A missing DDT is only expected while the pool is loading. */
 		if (ddt == NULL) {
 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 		if (dde == NULL) {
 			/*
 			 * Step io_stage back to just before DDT_READ_START
 			 * and redispatch -- presumably so that stage runs
 			 * again and takes its repair branch.
 			 */
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (ZIO_PIPELINE_STOP);
 		}
 		if (dde->dde_repair_data != NULL) {
 			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
 		zio->io_vsd = NULL;
 	}
 
 	ASSERT(zio->io_vsd == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Decide whether the data being written by 'zio' differs from what the DDT
  * entry 'dde' already refers to.
  *
  * The comparison is made against the first in-flight lead zio if there is
  * one; otherwise against the first phys copy with a non-zero birth, which
  * is read back through the ARC.  Returns B_TRUE on a size or content
  * mismatch, and also when that read fails.  Returns B_FALSE when there is
  * nothing to compare against.
  *
  * The DDT lock is dropped around the ARC read and retaken before returning.
  */
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 */
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		zio_t *lio = dde->dde_lead_zio[p];
 
 		/* Only the first lead zio found is compared. */
 		if (lio != NULL) {
 			return (lio->io_orig_size != zio->io_orig_size ||
 			    bcmp(zio->io_orig_data, lio->io_orig_data,
 			    zio->io_orig_size) != 0);
 		}
 	}
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		/* Only the first phys copy with a non-zero birth is read. */
 		if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 
 			/* Do not hold the DDT lock across the blocking read. */
 			ddt_exit(ddt);
 
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (arc_buf_size(abuf) != zio->io_orig_size ||
 				    bcmp(abuf->b_data, zio->io_orig_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(EEXIST);
 				VERIFY(arc_buf_remove_ref(abuf, &abuf));
 			}
 
 			ddt_enter(ddt);
 			/* A failed read is reported as a collision as well. */
 			return (error != 0);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Ready callback for the lead write of a dedup block.  On success, records
  * the newly written block in the entry's phys slot for this copy count and
  * fills in the block pointer of every parent, all under the DDT lock.
  * Does nothing if the write has failed.
  */
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	zio_t *pio;
 
 	if (zio->io_error != 0)
 		return;
 
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_lead_zio[p] == zio);
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
 	for (pio = zio_walk_parents(zio); pio != NULL;
 	    pio = zio_walk_parents(zio)) {
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Done callback for the lead write of a dedup block.  Removes the zio as
  * the entry's lead zio for this copy count.  On success, one reference is
  * added to the phys slot per parent; on failure the phys slot is cleared.
  */
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[p] == zio);
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error != 0) {
 		ddt_phys_clear(ddp);
 	} else {
 		/* One reference for each parent waiting on this write. */
 		while (zio_walk_parents(zio) != NULL)
 			ddt_phys_addref(ddp);
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Done callback for a ditto write of a dedup block.  Removes the zio as the
  * entry's lead ditto zio.  On success, any previously recorded ditto copy
  * is freed and the ditto phys slot is filled from the new block pointer.
  */
 static void
 zio_ddt_ditto_write_done(zio_t *zio)
 {
 	int p = DDT_PHYS_DITTO;
 	zio_prop_t *zp = &zio->io_prop;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	ddt_key_t *ddk = &dde->dde_key;
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[p] == zio);
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error != 0) {
 		ddt_exit(ddt);
 		return;
 	}
 
 	ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
 	ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
 	ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
 
 	/* Replace an older ditto copy, if one was recorded. */
 	if (ddp->ddp_phys_birth != 0)
 		ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
 	ddt_phys_fill(ddp, bp);
 
 	ddt_exit(ddt);
 }
 
 /*
  * Pipeline stage: write a dedup block.
  *
  * Looks up (creating if needed) the DDT entry for the block and then:
  *  - on a verified collision, either restarts the write with the pool's
  *    dedup checksum or falls back to an ordinary, non-dedup write;
  *  - if more ditto copies are needed than exist and no ditto write is in
  *    flight, creates a ditto child write (restarting without the override
  *    bp first, if one was supplied);
  *  - if the block already exists or is being written, attaches to it;
  *    otherwise records the override bp or creates the lead child write.
  *
  * Child writes are created under the DDT lock and started after it is
  * dropped.
  */
 static int
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	/* The copy count doubles as the index of the phys slot. */
 	int p = zp->zp_copies;
 	int ditto_copies;
 	zio_t *cio = NULL;
 	zio_t *dio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 
 	ddt_enter(ddt);
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = &dde->dde_phys[p];
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 		}
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
 	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
 
 	if (ditto_copies > ddt_ditto_copies_present(dde) &&
 	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
 		/* Same properties as the parent, but with the ditto copy count. */
 		zio_prop_t czp = *zp;
 
 		czp.zp_copies = ditto_copies;
 
 		/*
 		 * If we arrived here with an override bp, we won't have run
 		 * the transform stack, so we won't have the data we need to
 		 * generate a child i/o.  So, toss the override bp and restart.
 		 * This is safe, because using the override bp is just an
 		 * optimization; and it's rare, so the cost doesn't matter.
 		 */
 		if (zio->io_bp_override) {
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			zio->io_pipeline = ZIO_WRITE_PIPELINE;
 			zio->io_bp_override = NULL;
 			BP_ZERO(bp);
 			ddt_exit(ddt);
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
 		    zio->io_orig_size, &czp, NULL, NULL,
 		    zio_ddt_ditto_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
 	}
 
 	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
 		/* The block already exists on disk or is being written. */
 		if (ddp->ddp_phys_birth != 0)
 			ddt_bp_fill(ddp, bp, txg);
 		if (dde->dde_lead_zio[p] != NULL)
 			zio_add_child(zio, dde->dde_lead_zio[p]);
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
 		/* Adopt the block described by the override bp. */
 		ASSERT(bp->blk_birth == txg);
 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
 	} else {
 		/* First writer: this child becomes the entry's lead zio. */
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
 		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[p] = cio;
 	}
 
 	ddt_exit(ddt);
 
 	/* Start the children only after the DDT lock has been released. */
 	if (cio)
 		zio_nowait(cio);
 	if (dio)
 		zio_nowait(dio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 ddt_entry_t *freedde; /* for debugging */
 
 /*
  * Pipeline stage: free a dedup block by dropping one reference from the
  * DDT phys entry that matches its block pointer.
  *
  * The looked-up entry is also stored in the global 'freedde' for
  * debugging.  Always returns ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_ddt_free(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = ddt_phys_select(dde, bp);
 	/*
 	 * ddt_phys_select() returns NULL when no phys entry matches the bp
 	 * (zio_ddt_read_start() checks for the same case).  There is then
 	 * no reference to drop, so skip the decref rather than pass it NULL.
 	 */
 	if (ddp != NULL)
 		ddt_phys_decref(ddp);
 	ddt_exit(ddt);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 /*
  * Pipeline stage: allocate DVAs for the block being written.
  *
  * Makes the zio its own gang leader if it has none, then asks the normal
  * metaslab class for zp_copies copies of io_size bytes.  On ENOSPC for a
  * block larger than SPA_MINBLOCKSIZE the write is retried as a gang block;
  * any other failure is stored in zio->io_error.
  */
 static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	/*
 	 * The dump device does not support gang blocks so allocation on
 	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
 	 * the "fast" gang feature.
 	 */
 	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
 	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
 	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
 	if (error) {
 		/*
 		 * Cast io_size so the argument type matches the "%llu"
 		 * conversion regardless of how the platform defines its
 		 * 64-bit integer types.
 		 */
 		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
 		    "size %llu, error %d", spa_name(spa), zio,
 		    (unsigned long long)zio->io_size, error);
 		/* Out of contiguous space: fall back to a gang block. */
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: hand the zio's block back to the metaslab layer
  * (metaslab_free() with its last argument B_FALSE).
  */
 static int
 zio_dva_free(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 
 	metaslab_free(spa, bp, zio->io_txg, B_FALSE);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: claim the zio's block through metaslab_claim().  A
  * non-zero result is recorded in zio->io_error; the pipeline continues
  * either way.
  */
 static int
 zio_dva_claim(zio_t *zio)
 {
 	int error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
 	if (error != 0)
 		zio->io_error = error;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Give back a block that was just allocated; zio_done() uses this when an
  * I/O fails.  A non-hole 'bp' is freed, and if 'gn' is non-NULL the same
  * is done recursively for every block pointer in its gang header, so both
  * normal blocks and gang blocks are handled.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
 	int g;
 
 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp))
 		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
 
 	if (gn == NULL)
 		return;
 
 	for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 
 		zio_dva_unallocate(zio, gn->gn_child[g], gbp);
 	}
 }
 
 /*
  * Allocate a block for the intent log and initialise new_bp to describe it.
  *
  * When use_slog is set the log class is tried first; if that fails, or
  * use_slog is not set, the normal class is used.  Returns 0 on success or
  * the errno from the last allocation attempt.
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
     uint64_t size, boolean_t use_slog)
 {
 	enum zio_checksum cksum;
 	int error;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	/*
 	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
 	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
 	 * when allocating them.
 	 */
 	if (use_slog) {
 		error = metaslab_alloc(spa, spa_log_class(spa), size,
 		    new_bp, 1, txg, old_bp,
 		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
 	} else {
 		/* No slog attempt: force the normal-class allocation. */
 		error = 1;
 	}
 
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, old_bp,
 		    METASLAB_HINTBP_AVOID);
 	}
 
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) >= SPA_VERSION_SLIM_ZIL)
 		cksum = ZIO_CHECKSUM_ZILOG2;
 	else
 		cksum = ZIO_CHECKSUM_ZILOG;
 
 	BP_SET_LSIZE(new_bp, size);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(new_bp, cksum);
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_LEVEL(new_bp, 0);
 	BP_SET_DEDUP(new_bp, 0);
 	BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 	return (0);
 }
 
 /*
  * Free an intent log block.  'bp' must be of type DMU_OT_INTENT_LOG and
  * must not be a gang block; the free itself is delegated to zio_free().
  */
 void
 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
 {
 	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
 	ASSERT(!BP_IS_GANG(bp));
 
 	zio_free(spa, txg, bp);
 }
 
 /*
  * ==========================================================================
  * Read, write and delete to physical devices
  * ==========================================================================
  */
 
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  */
 static int
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 	int ret;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	/*
 	 * No vdev: the I/O is addressed by block pointer.  Take the
 	 * SCL_ZIO config lock as reader (released in zio_vdev_io_assess())
 	 * unless the caller already holds it as writer.
 	 */
 	if (vd == NULL) {
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	/*
 	 * A ZIO_PRIORITY_NOW free on a leaf is only recorded in the trim
 	 * map; nothing is sent to the device from here.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
 	    zio->io_priority == ZIO_PRIORITY_NOW) {
 		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/*
 	 * We keep track of time-sensitive I/Os so that the scan thread
 	 * can quickly react to certain workloads.  In particular, we care
 	 * about non-scrubbing, top-level reads and writes with the following
 	 * characteristics:
 	 *	- synchronous writes of user data to non-slog devices
 	 *	- any reads of user data
 	 * When these conditions are met, adjust the timestamp of spa_last_io
 	 * which allows the scan thread to adjust its workload accordingly.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
 	    vd == vd->vdev_top && !vd->vdev_islog &&
 	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
 	    zio->io_txg != spa_syncing_txg(spa)) {
 		uint64_t old = spa->spa_last_io;
 		uint64_t new = ddi_get_lbolt64();
 		/* The CAS result is ignored: losing a race here is harmless. */
 		if (old != new)
 			(void) atomic_cas_64(&spa->spa_last_io, old, new);
 	}
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		char *abuf = NULL;
 		/* Only reads and writes carry data, so only they get a buffer. */
 		if (zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_WRITE)
 			abuf = zio_buf_alloc(asize);
 		ASSERT(vd == vd->vdev_top);
 		/* Writes copy the payload and zero-fill the padding. */
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			bcopy(zio->io_data, abuf, zio->io_size);
 			bzero(abuf + zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
 		    zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	/* Anything other than a read requires a writeable pool. */
 	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering with nested replication.
 	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
 	 * A is out of date, we'll read from C+D, then use the data to
 	 * resilver A+B -- but we don't actually want to resilver B, just A.
 	 * The top-level mirror has no way to know this, so instead we just
 	 * discard unnecessary repairs as we work our way down the vdev tree.
 	 * The same logic applies to any form of nested replication:
 	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		switch (zio->io_type) {
 		case ZIO_TYPE_READ:
 			/* Satisfied from the vdev cache: no device I/O needed. */
 			if (vdev_cache_read(zio))
 				return (ZIO_PIPELINE_CONTINUE);
 			/* FALLTHROUGH */
 		case ZIO_TYPE_WRITE:
 		case ZIO_TYPE_FREE:
 			/*
 			 * vdev_queue_io() may return a different zio than the
 			 * one passed in, or NULL if nothing is to be issued
 			 * from here right now.
 			 */
 			if ((zio = vdev_queue_io(zio)) == NULL)
 				return (ZIO_PIPELINE_STOP);
 
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 				zio_interrupt(zio);
 				return (ZIO_PIPELINE_STOP);
 			}
 			break;
 		}
 		/*
 		 * Note that we ignore repair writes for TRIM because they can
 		 * conflict with normal writes. This isn't an issue because, by
 		 * definition, we only repair blocks that aren't freed.
 		 */
 		if (zio->io_type == ZIO_TYPE_WRITE &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 		    !trim_map_write_start(zio))
 			return (ZIO_PIPELINE_STOP);
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (ZIO_PIPELINE_STOP);
 }
 
 /*
  * Pipeline stage: post-process a vdev I/O once all of its vdev children are
  * done.
  *
  * For leaf vdevs this finishes trim-map and queue bookkeeping, updates the
  * vdev cache on writes, applies fault injection, and classifies any error.
  * The vdev's own io_done op is then called (the mirror ops when the zio
  * has no vdev), and an unexpected error triggers a probe of the device.
  */
 static int
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_FREE)) {
 
 		/* Counterpart of trim_map_write_start() in zio_vdev_io_start(). */
 		if (zio->io_type == ZIO_TYPE_WRITE &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
 			trim_map_write_done(zio);
 
 		vdev_queue_io_done(zio);
 
 		if (zio->io_type == ZIO_TYPE_WRITE)
 			vdev_cache_write(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injection(vd,
 			    zio, EIO);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		if (zio->io_error) {
 			if (zio->io_error == ENOTSUP &&
 			    zio->io_type == ZIO_TYPE_FREE) {
 				/* Not all devices support TRIM. */
 			} else if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
 			}
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	/* The device is accessible yet the I/O failed: probe it. */
 	if (unexpected_error)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
  *
  * This is the finish half: it passes the known-good buffer, together with
  * the copy saved in zcr->zcr_cbdata by zio_vsd_default_cksum_report(), to
  * zfs_ereport_finish_checksum().
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
     const void *good_buf)
 {
 	/* no processing needed */
 	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
 }
 
 /*
  * Default checksum-report setup: take a private copy of the zio's current
  * data and record it, with its size, in the report.  The copy is later
  * consumed by zio_vsd_default_cksum_finish() and released through
  * zio_buf_free().  The third argument is unused.
  */
 /*ARGSUSED*/
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
 {
 	void *copy;
 
 	copy = zio_buf_alloc(zio->io_size);
 	bcopy(zio->io_data, copy, zio->io_size);
 
 	zcr->zcr_cbdata = copy;
 	zcr->zcr_cbinfo = zio->io_size;
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_buf_free;
 }
 
 /*
  * Pipeline stage: assess the outcome of a vdev I/O once all of its vdev
  * children are done.
  *
  * Releases the config lock taken in zio_vdev_io_start(), frees any
  * vdev-specific data, applies fault injection, updates TRIM statistics,
  * and retries a failed vdev-less I/O once.  Any remaining error reduces
  * the pipeline to ZIO_INTERLOCK_PIPELINE.
  */
 static int
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	/* Mirrors the spa_config_enter() in zio_vdev_io_start(). */
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * Account for frees other than the ZIO_PRIORITY_NOW ones, which
 	 * zio_vdev_io_start() only records in the trim map.
 	 */
 	if (zio->io_type == ZIO_TYPE_FREE &&
 	    zio->io_priority != ZIO_PRIORITY_NOW) {
 		switch (zio->io_error) {
 		case 0:
 			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
 			ZIO_TRIM_STAT_BUMP(success);
 			break;
 		case EOPNOTSUPP:
 			ZIO_TRIM_STAT_BUMP(unsupported);
 			break;
 		default:
 			ZIO_TRIM_STAT_BUMP(failed);
 			break;
 		}
 	}
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
 	 * On retry, we cut in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		/* ZIO_FLAG_IO_RETRY ensures only one retry is attempted. */
 		zio->io_flags |= ZIO_FLAG_IO_RETRY |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
 		/* Step back to just before VDEV_IO_START and redispatch. */
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	/*
 	 * If we got an error on a leaf device, convert it to ENXIO
 	 * if the device is not accessible at all.
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
 		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_cant_write = B_TRUE;
 	}
 
 	/* On error, skip the remaining stages and go to the interlock ones. */
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	/* Tell the logical I/O that a physical leaf I/O has completed. */
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    zio->io_physdone != NULL) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
 		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
 		zio->io_physdone(zio->io_logical);
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Step a zio that is sitting in VDEV_IO_START back by one stage bit.
  * NOTE(review): presumably this makes the pipeline run VDEV_IO_START again
  * on its next advance -- confirm against the pipeline executor.
  */
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Step a zio that is sitting in VDEV_IO_DONE back by one stage bit.
  * NOTE(review): presumably this makes the pipeline run VDEV_IO_DONE again
  * on its next advance -- confirm against the pipeline executor.
  */
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Mark a zio in VDEV_IO_START as bypassed and position its stage one bit
  * below VDEV_IO_ASSESS.
  * NOTE(review): presumably the pipeline then resumes at VDEV_IO_ASSESS,
  * skipping the stages in between -- confirm against the pipeline executor.
  */
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 /*
  * Pipeline stage: pick the checksum algorithm for this zio and compute the
  * checksum over io_data.  Always returns ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_checksum_generate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
 	if (bp != NULL) {
 		/* Gang children checksum the gang header, not the data. */
 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		} else {
 			checksum = BP_GET_CHECKSUM(bp);
 		}
 	} else {
 		/*
 		 * No block pointer: this is zio_write_phys().
 		 * Either a label checksum is wanted, or none at all.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: verify the checksum of the data this zio read.  A failure
  * is recorded in io_error; a non-speculative ECKSUM additionally starts a
  * checksum ereport.  Always returns ZIO_PIPELINE_CONTINUE.
  */
 static int
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
 	int error;
 
 	ASSERT(zio->io_vd != NULL);
 
 	if (zio->io_bp == NULL) {
 		/*
 		 * No block pointer: this is zio_read_phys().
 		 * Either a label checksum is verified, or nothing at all.
 		 */
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	error = zio_checksum_error(zio, &info);
 	if (error == 0)
 		return (ZIO_PIPELINE_CONTINUE);
 
 	zio->io_error = error;
 
 	/* Only genuine checksum mismatches on non-speculative I/O are reported. */
 	if (error != ECKSUM || (zio->io_flags & ZIO_FLAG_SPECULATIVE))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, zio,
 	    zio->io_offset, zio->io_size, NULL, &info);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Called by RAID-Z to ensure we don't compute the checksum twice.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	/* Drop the CHECKSUM_VERIFY stage bit from this zio's pipeline mask. */
 	zio->io_pipeline = zio->io_pipeline & ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * ==========================================================================
  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  * ==========================================================================
  */
 /*
  * Return whichever of e1 and e2 ranks as the worse error.
  *
  * Known errors rank by their position in zio_error_rank[]; any error not
  * in the table ranks one past its end, i.e. worse than all known errors.
  * When both arguments have the same rank, e2 is returned.
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	/* Read-only lookup table, ordered from best to worst. */
 	static const int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
 	/* Signed count so the loop comparisons below are not mixed-sign. */
 	const int nranks =
 	    (int)(sizeof (zio_error_rank) / sizeof (zio_error_rank[0]));
 	int r1, r2;
 
 	for (r1 = 0; r1 < nranks; r1++)
 		if (e1 == zio_error_rank[r1])
 			break;
 
 	for (r2 = 0; r2 < nranks; r2++)
 		if (e2 == zio_error_rank[r2])
 			break;
 
 	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 /*
  * Pipeline stage: the zio has become "ready".
  *
  * Waits for gang and DDT children to reach the ready state (returning
  * ZIO_PIPELINE_STOP while any are outstanding), invokes the io_ready
  * callback if one is set, snapshots the block pointer into io_bp_copy,
  * records the ZIO_WAIT_READY state under io_lock and notifies the parents
  * that were on the parent list at that moment.  Returns
  * ZIO_PIPELINE_CONTINUE once the notification is done.
  */
 static int
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
 		return (ZIO_PIPELINE_STOP);
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
 		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 	/* Keep a private copy of the bp; zio_done() compares against it. */
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 
 	/* On error, switch to the interlock pipeline for the remaining stages. */
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	/* Publish the ready state and fetch the first parent under io_lock. */
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (BP_IS_GANG(bp)) {
 			/* A gang block clears NODATA and keeps its vdev stages. */
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			/* Otherwise strip the vdev I/O stages from the pipeline. */
 			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	/* Fault-injection hook, only for zios in the currently syncing txg. */
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Pipeline stage: complete a zio.
  *
  * In order, this function:
  *   - waits for every child type to be done (returning ZIO_PIPELINE_STOP
  *     while any are outstanding);
  *   - sanity-checks the block pointer;
  *   - inherits vdev/gang/ddt child errors, finishes pending checksum
  *     reports, pops transforms and updates vdev statistics;
  *   - posts ereports and decides whether a failed logical zio must be
  *     reexecuted or suspended;
  *   - inherits logical child errors and unallocates DVAs of a failed
  *     allocating gang leader;
  *   - either hands the zio off for reexecution/suspension, or runs the
  *     io_done callback, notifies parents, and wakes the waiter or destroys
  *     the zio.
  *
  * Every return path yields ZIO_PIPELINE_STOP.
  */
 static int
 zio_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_t *lio = zio->io_logical;
 	blkptr_t *bp = zio->io_bp;
 	vdev_t *vd = zio->io_vd;
 	uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	/* Past this point no child of any type may still be outstanding. */
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	/* Debug-time consistency checks on the block pointer. */
 	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
 		ASSERT(bp->blk_pad[0] == 0);
 		ASSERT(bp->blk_pad[1] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
 		    (bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
 			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			char *abuf = zio->io_data;
 
 			/*
 			 * When the report's alignment exceeds the I/O size,
 			 * hand it a zero-padded copy of the data instead.
 			 */
 			if (asize != psize) {
 				abuf = zio_buf_alloc(asize);
 				bcopy(zio->io_data, abuf, psize);
 				bzero(abuf + psize, asize - psize);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, abuf);
 			zfs_ereport_free_checksum(zcr);
 
 			if (asize != psize)
 				zio_buf_free(abuf, asize);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    zio == lio) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(spa, zio);
 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
 			    0, 0);
 		}
 	}
 
 	if (zio->io_error && zio == lio) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(vd == NULL && bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		/*
 		 * A must-succeed allocating zio retries immediately, unless
 		 * it failed with ENOSPC, in which case it suspends.
 		 */
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/* Any other must-succeed failure suspends as a last resort. */
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	/* A failed allocating gang leader gives back the space it allocated. */
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute = 0;
 
 	if (zio->io_reexecute) {
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
 			/* Capture the link before the walk advances past it. */
 			zio_link_t *zl = zio->io_walk_link;
 			pio_next = zio_walk_parents(zio);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, zl);
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(spa, zio);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 #if defined(illumos) || !defined(_KERNEL)
 			ASSERT(zio->io_tqent.tqent_next == NULL);
 #else
 			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
 #endif
 			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
 			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
 			    0, &zio->io_tqent);
 		}
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	ASSERT(zio->io_child_count == 0);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		/* No good data is available here, so finish with NULL. */
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	/* Detach from every parent and tell each one that we are done. */
 	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
 		zio_link_t *zl = zio->io_walk_link;
 		pio_next = zio_walk_parents(zio);
 		zio_remove_child(pio, zio, zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
 	/*
 	 * If a thread is waiting on this zio, wake it and leave the zio
 	 * alone; otherwise nobody else will look at it, so destroy it here.
 	 */
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (ZIO_PIPELINE_STOP);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  */
 /*
  * Table of pipeline stage handlers.  Slot 0 is NULL; each remaining slot
  * holds the handler for one stage, ending with zio_done.
  * NOTE(review): presumably indexed by stage number, so the order here must
  * stay in sync with the ZIO_STAGE_* definitions -- confirm against the
  * header that defines them before adding or reordering entries.
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_bp_init,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_done
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.
  *
  * This is simple in every case aside from the meta-dnode object. For all other
  * objects, we traverse them in order (object 1 before object 2, and so on).
  * However, all of these objects are traversed while traversing object 0, since
  * the data it points to is the list of objects.  Thus, we need to convert to a
  * canonical representation so we can compare meta-dnode bookmarks to
  * non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
  * blocks this bookmark refers to) by multiplying their blkid by their span
  * (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
  * level + 1<<31 (any value larger than a level could ever be) for their level.
  * This causes them to always compare before a bookmark in their object
  * equivalent, compare appropriately to bookmarks in other objects, and to
  * compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/*
 	 * Canonical ("equivalent") object, level-0 block id and level of each
 	 * bookmark, after meta-dnode bookmarks have been mapped onto their
 	 * normal-object equivalents.
 	 */
 	uint64_t zb1obj, zb1L0, zb1level;
 	uint64_t zb2obj, zb2L0, zb2level;
 
 	/* Field-for-field identical bookmarks compare equal. */
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	/*
 	 * Canonicalize the first bookmark.  BP_SPANB gives the span in
 	 * blocks, so the product is the first L0 blkid the bookmark covers.
 	 */
 	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	if (zb1->zb_object != DMU_META_DNODE_OBJECT) {
 		zb1obj = zb1->zb_object;
 		zb1level = zb1->zb_level;
 	} else {
 		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb1L0 = 0;
 		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/* Canonicalize the second bookmark the same way. */
 	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 	if (zb2->zb_object != DMU_META_DNODE_OBJECT) {
 		zb2obj = zb2->zb_object;
 		zb2level = zb2->zb_level;
 	} else {
 		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb2L0 = 0;
 		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/* Order by object, then by L0 equivalent (both ascending)... */
 	if (zb1obj < zb2obj)
 		return (-1);
 	if (zb1obj > zb2obj)
 		return (1);
 	if (zb1L0 < zb2L0)
 		return (-1);
 	if (zb1L0 > zb2L0)
 		return (1);
 
 	/* ...then by level, where the HIGHER level sorts first. */
 	if (zb1level > zb2level)
 		return (-1);
 	if (zb1level < zb2level)
 		return (1);
 
 	/*
 	 * This can (theoretically) happen if the bookmarks have the same object
 	 * and level, but different blkids, if the block sizes are not the same.
 	 * There is presently no way to change the indirect block sizes.
 	 */
 	return (0);
 }
 
 /*
  *  This function checks the following: given that last_block is the place that
  *  our traversal stopped last time, does that guarantee that we've visited
  *  every node under subtree_root?  Therefore, we can't just use the raw output
  *  of zbookmark_compare.  We have to pass in a modified version of
  *  subtree_root; by incrementing the block id, and then checking whether
  *  last_block is before or equal to that, we can tell whether or not having
  *  visited last_block implies that all of subtree_root's children have been
  *  visited.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	/* The block immediately after subtree_root at the same level. */
 	zbookmark_phys_t next_root = *subtree_root;
 	int order;
 
 	next_root.zb_blkid += 1;
 	ASSERT(last_block->zb_level == 0);
 
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * Arguments for the second bookmark (last_block):
 	 *
 	 * The data block size in sectors is given as
 	 * 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT).  That value is
 	 * only used when the bookmark refers to a block in the meta-dnode;
 	 * since we don't know which object it refers to without examining
 	 * it, and the value is harmless otherwise, it is always passed.
 	 *
 	 * The indirect block size shift is given as 0 because last_block must
 	 * be level 0.  The shift only feeds the span computation, and a
 	 * level-0 bookmark always has a span of 1, so the math works out.
 	 *
 	 * If you change how zbookmark_compare works, make sure this code
 	 * still works afterwards.
 	 */
 	order = zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &next_root,
 	    last_block);
 
 	return (order <= 0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	(revision 296519)
@@ -1,3209 +1,3210 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  *
  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Portions Copyright 2010 Robert Milkowski
  *
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
 
 /*
  * ZFS volume emulation driver.
  *
  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  * Volumes are accessed through the symbolic links named:
  *
  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  *
  * These links are created by the /dev filesystem (sdev_zvolops.c).
  * Volumes are persistent through reboot.  No user command needs to be
  * run before opening and using a device.
  *
  * FreeBSD notes.
  * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
  * in the system.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/disk.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dnode.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dkio.h>
 #include <sys/byteorder.h>
 #include <sys/sunddi.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/queue.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zil.h>
 #include <sys/refcount.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zil_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_tx.h>
 #include <sys/zfeature.h>
 #include <sys/zio_checksum.h>
 #include <sys/filio.h>
 
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
 
 #ifndef illumos
 /*
  * GEOM class under which zvols are presented, named "ZFS::ZVOL" and
  * registered with GEOM by the DECLARE_GEOM_CLASS() invocation below.
  */
 struct g_class zfs_zvol_class = {
 	.name = "ZFS::ZVOL",
 	.version = G_VERSION,
 };
 
 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
 
 #endif
 void *zfsdev_state;
 static char *zvol_tag = "zvol_tag";
 
 #define	ZVOL_DUMPSIZE		"dumpsize"
 
 /*
  * This lock protects the zfsdev_state structure from being modified
  * while it's being used, e.g. an open that comes in before a create
  * finishes.  It also protects temporary opens of the dataset so that,
  * e.g., an open doesn't get a spurious EBUSY.
  */
 #ifdef illumos
 kmutex_t zfsdev_state_lock;
 #else
 /*
  * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
  * spa_namespace_lock in the ZVOL code.
  */
 #define zfsdev_state_lock spa_namespace_lock
 #endif
 static uint32_t zvol_minors;
 
 #ifndef illumos
 /* Tunables published under the vfs.zfs.vol sysctl node. */
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
 /* How volumes are exposed; the built-in default is GEOM providers. */
 static int	volmode = ZFS_VOLMODE_GEOM;
 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
     "Expose as GEOM providers (1), device files (2) or neither");
 /* Off by default: whether a pool may use zvols as its vdevs. */
 static boolean_t zpool_on_zvol = B_FALSE;
 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
     "Allow zpools to use zvols as vdevs (DANGEROUS)");
 
 #endif
 /*
  * One run of physically consecutive volume blocks on a single vdev.
  * Extents are kept on a volume's zv_extents list and are built by
  * zvol_map_block().
  */
 typedef struct zvol_extent {
 	list_node_t	ze_node;	/* linkage on zv_extents */
 	dva_t		ze_dva;		/* dva associated with this extent */
 	uint64_t	ze_nblks;	/* number of blocks in extent */
 } zvol_extent_t;
 
 /*
  * The in-core state of each volume.
  */
 typedef struct zvol_state {
 #ifndef illumos
 	LIST_ENTRY(zvol_state)	zv_links;	/* linkage on all_zvols */
 #endif
 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
 	uint64_t	zv_volsize;	/* amount of space we advertise */
 	uint64_t	zv_volblocksize; /* volume block size */
 #ifdef illumos
 	minor_t		zv_minor;	/* minor number */
 #else
 	struct cdev	*zv_dev;	/* non-GEOM device */
 	struct g_provider *zv_provider;	/* GEOM provider */
 #endif
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
 	objset_t	*zv_objset;	/* objset handle */
 #ifdef illumos
 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
 #endif
 	uint32_t	zv_total_opens;	/* total open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
 	list_t		zv_extents;	/* List of extents for dump */
 	znode_t		zv_znode;	/* for range locking */
 	dmu_buf_t	*zv_dbuf;	/* bonus handle */
 #ifndef illumos
 	/* NOTE(review): the meaning of zv_state is not visible here. */
 	int		zv_state;
 	int		zv_volmode;	/* Provide GEOM or cdev */
 	/* NOTE(review): presumably bios queued for the GEOM worker -- confirm. */
 	struct bio_queue_head zv_queue;
 	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
 #endif
 } zvol_state_t;
 
 #ifndef illumos
 static LIST_HEAD(, zvol_state) all_zvols;
 #endif
 /*
  * zvol specific flags
  */
 #define	ZVOL_RDONLY	0x1
 #define	ZVOL_DUMPIFIED	0x2
 #define	ZVOL_EXCL	0x4
 #define	ZVOL_WCE	0x8
 
 /*
  * zvol maximum transfer in one DMU tx.
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
 /*
  * Toggle unmap functionality.
  */
 boolean_t zvol_unmap_enabled = B_TRUE;
 #ifndef illumos
 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
     &zvol_unmap_enabled, 0,
     "Enable UNMAP functionality");
 
 static d_open_t		zvol_d_open;
 static d_close_t	zvol_d_close;
 static d_read_t		zvol_read;
 static d_write_t	zvol_write;
 static d_ioctl_t	zvol_d_ioctl;
 static d_strategy_t	zvol_strategy;
 
 /*
  * Character-device switch table for zvols, wiring each device entry point
  * to its zvol handler.
  * NOTE(review): presumably used only for volumes exposed as device files
  * rather than GEOM providers -- confirm at the make_dev call site.
  */
 static struct cdevsw zvol_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	zvol_d_open,
 	.d_close =	zvol_d_close,
 	.d_read =	zvol_read,
 	.d_write =	zvol_write,
 	.d_ioctl =	zvol_d_ioctl,
 	.d_strategy =	zvol_strategy,
 	.d_name =	"zvol",
 	.d_flags =	D_DISK | D_TRACKCLOSE,
 };
 
 static void zvol_geom_run(zvol_state_t *zv);
 static void zvol_geom_destroy(zvol_state_t *zv);
 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
 static void zvol_geom_start(struct bio *bp);
 static void zvol_geom_worker(void *arg);
 static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
     uint64_t len, boolean_t sync);
 #endif	/* !illumos */
 
 extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
     nvlist_t *, nvlist_t *);
 static int zvol_remove_zv(zvol_state_t *);
 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
 static int zvol_dumpify(zvol_state_t *zv);
 static int zvol_dump_fini(zvol_state_t *zv);
 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 
 /*
  * Record a new volume size in the in-core state and propagate it to the
  * platform's device layer: device properties and specfs on illumos, the
  * GEOM provider (when the volume has one) elsewhere.
  */
 static void
 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 {
 #ifdef illumos
 	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
 
 	zv->zv_volsize = volsize;
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 	    "Size", volsize) == DDI_SUCCESS);
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 
 	/* Notify specfs to invalidate the cached size */
 	spec_size_invalidate(dev, VBLK);
 	spec_size_invalidate(dev, VCHR);
 #else	/* !illumos */
 	struct g_provider *pp;
 
 	zv->zv_volsize = volsize;
 
 	/* Only GEOM-backed volumes have a provider to resize. */
 	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
 		return;
 
 	pp = zv->zv_provider;
 	if (pp == NULL)
 		return;
 
 	g_topology_lock();
 	g_resize_provider(pp, zv->zv_volsize);
 	g_topology_unlock();
 #endif	/* illumos */
 }
 
 /*
  * Validate a proposed volume size against a block size.
  *
  * Returns 0 when volsize is a non-zero multiple of blocksize, and EINVAL
  * otherwise.  A blocksize of zero is rejected with EINVAL rather than being
  * used as a divisor.  On ILP32 builds a size beyond SPEC_MAXOFFSET_T yields
  * EOVERFLOW.
  */
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 {
 	if (volsize == 0)
 		return (SET_ERROR(EINVAL));
 
 	/* Guard the modulo below: dividing by a zero block size would trap. */
 	if (blocksize == 0 || volsize % blocksize != 0)
 		return (SET_ERROR(EINVAL));
 
 #ifdef _ILP32
 	if (volsize - 1 > SPEC_MAXOFFSET_T)
 		return (SET_ERROR(EOVERFLOW));
 #endif
 	return (0);
 }
 
 /*
  * Validate a proposed volume block size: it must lie within
  * [SPA_MINBLOCKSIZE, SPA_OLD_MAXBLOCKSIZE] and be a power of two.
  * Returns 0 if acceptable, EDOM otherwise.
  */
 int
 zvol_check_volblocksize(uint64_t volblocksize)
 {
 	if (volblocksize < SPA_MINBLOCKSIZE)
 		return (SET_ERROR(EDOM));
 
 	if (volblocksize > SPA_OLD_MAXBLOCKSIZE)
 		return (SET_ERROR(EDOM));
 
 	if (!ISP2(volblocksize))
 		return (SET_ERROR(EDOM));
 
 	return (0);
 }
 
 /*
  * Add the volume size and volume block size of objset 'os' to the property
  * list 'nv'.  Returns 0 on success or the error from the failing lookup;
  * if only the second lookup fails, the volume size has already been added.
  */
 int
 zvol_get_stats(objset_t *os, nvlist_t *nv)
 {
 	dmu_object_info_t info;
 	uint64_t volsize;
 	int err;
 
 	/* The size is stored as a single 8-byte integer under "size". */
 	err = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (err != 0)
 		return (err);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, volsize);
 
 	/* The block size comes from the data object itself. */
 	err = dmu_object_info(os, ZVOL_OBJ, &info);
 	if (err != 0)
 		return (err);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 	    info.doi_data_block_size);
 
 	return (0);
 }
 
 /*
  * Find the in-core state of the volume called 'name'.
  *
  * The caller must hold zfsdev_state_lock (asserted below).  Returns the
  * matching zvol_state_t, or NULL when no volume has that name.
  */
 static zvol_state_t *
 zvol_minor_lookup(const char *name)
 {
 #ifdef illumos
 	minor_t minor;
 #endif
 	zvol_state_t *zv;
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	/*
 	 * The loop header differs per platform but both variants open the
 	 * same body: illumos scans every minor number and skips those with
 	 * no zvol soft state, the other branch walks the all_zvols list.
 	 */
 #ifdef illumos
 	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 		if (zv == NULL)
 			continue;
 #else
 	LIST_FOREACH(zv, &all_zvols, zv_links) {
 #endif
 		if (strcmp(zv->zv_name, name) == 0)
 			return (zv);
 	}
 
 	return (NULL);
 }
 
 /* extent mapping arg */
 struct maparg {
 	zvol_state_t	*ma_zv;		/* volume whose extent list is built */
 	uint64_t	ma_blks;	/* L0 blocks seen; also next expected blkid */
 };
 
 /*ARGSUSED*/
 /*
  * traverse_dataset() callback that records where each level-0 data block of
  * the volume lives on disk.
  *
  * 'arg' is a struct maparg.  Holes, blocks of other objects and indirect
  * blocks are ignored.  Each remaining block either extends the last extent
  * on the volume's zv_extents list (same vdev, offset directly following the
  * extent) or starts a new extent.
  *
  * Returns 0 to continue the traversal, or EFRAGS when a gang block is found.
  */
 static int
 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct maparg *ma = arg;
 	zvol_extent_t *ze;
 	/* Same width as zv_volblocksize, so the value is never narrowed. */
 	uint64_t bs = ma->ma_zv->zv_volblocksize;
 
 	if (bp == NULL || BP_IS_HOLE(bp) ||
 	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 		return (0);
 
 	VERIFY(!BP_IS_EMBEDDED(bp));
 
 	/* Blocks must arrive densely and in order, starting from blkid 0. */
 	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 	ma->ma_blks++;
 
 	/* Abort immediately if we have encountered gang blocks */
 	if (BP_IS_GANG(bp))
 		return (SET_ERROR(EFRAGS));
 
 	/*
 	 * See if the block is at the end of the previous extent.
 	 */
 	ze = list_tail(&ma->ma_zv->zv_extents);
 	if (ze != NULL &&
 	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
 	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
 	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
 		ze->ze_nblks++;
 		return (0);
 	}
 
 	dprintf_bp(bp, "%s", "next blkptr:");
 
 	/* start a new extent */
 	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
 	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
 	ze->ze_nblks = 1;
 	list_insert_tail(&ma->ma_zv->zv_extents, ze);
 	return (0);
 }
 
 /*
  * Remove and free every extent on the volume's zv_extents list, leaving
  * the list empty.
  */
 static void
 zvol_free_extents(zvol_state_t *zv)
 {
 	zvol_extent_t *ze;
 
 	/* Pop from the head until the list is empty. */
 	while ((ze = list_head(&zv->zv_extents)) != NULL) {
 		list_remove(&zv->zv_extents, ze);
 		kmem_free(ze, sizeof (zvol_extent_t));
 	}
 }
 
 /*
  * Rebuild the volume's extent list by traversing its dataset.
  *
  * Any existing extents are discarded first.  Returns 0 on success.  If the
  * traversal fails, its error is returned; if it succeeds but did not visit
  * exactly zv_volsize / zv_volblocksize blocks, EIO is returned.  In both
  * failure cases the extent list is left empty.
  */
 static int
 zvol_get_lbas(zvol_state_t *zv)
 {
 	objset_t *os = zv->zv_objset;
 	struct maparg	ma;
 	int		err;
 
 	ma.ma_zv = zv;
 	ma.ma_blks = 0;
 	zvol_free_extents(zv);
 
 	/* commit any in-flight changes before traversing the dataset */
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	err = traverse_dataset(dmu_objset_ds(os), 0,
 	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
 
 	/* A clean traversal that missed blocks is reported as an I/O error. */
 	if (err == 0 && ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize))
 		err = SET_ERROR(EIO);
 
 	if (err != 0) {
 		zvol_free_extents(zv);
 		return (err);
 	}
 
 	return (0);
 }
 
 /* ARGSUSED */
 /*
  * Creation callback for a new volume's objset: claims the volume's data
  * object and its property ZAP, and stores the volume size under "size".
  *
  * 'arg' is a zfs_creat_t whose property list must contain the volume size;
  * the volume block size falls back to the property default when absent.
  * Both properties are removed from the list before returning.
  */
 void
 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	zfs_creat_t *zct = arg;
 	nvlist_t *props = zct->zct_props;
 	const char *volsize_name = zfs_prop_to_name(ZFS_PROP_VOLSIZE);
 	const char *volblocksize_name =
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
 	uint64_t volsize, volblocksize;
 	int error;
 
 	VERIFY(nvlist_lookup_uint64(props, volsize_name, &volsize) == 0);
 	if (nvlist_lookup_uint64(props, volblocksize_name,
 	    &volblocksize) != 0)
 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 
 	/*
 	 * These properties must be removed from the list so the generic
 	 * property setting step won't apply to them.
 	 */
 	VERIFY(nvlist_remove_all(props, volsize_name) == 0);
 	(void) nvlist_remove_all(props, volblocksize_name);
 
 	/* The data object, using the chosen block size. */
 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/* The ZAP object that holds the volume's properties. */
 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/* The size is stored as a single 8-byte integer. */
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 	ASSERT(error == 0);
 }
 
 /*
  * ZIL replay handler for TX_TRUNCATE records (the record type used to
  * implement DKIOCFREE/free-long-range).  Byte-swaps the record first when
  * requested, then frees the logged range of the zvol data object.
  */
 static int
 zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
 {
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 	    lr->lr_offset, lr->lr_length));
 }
 
 /*
  * ZIL replay handler for TX_WRITE records that were logged but not
  * committed before a system failure.
  *
  * The payload is taken from the bytes that follow the lr_write_t.  Returns
  * 0 on success or the error from dmu_tx_assign().
  */
 static int
 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 {
 	objset_t *os = zv->zv_objset;
 	char *payload = (char *)(lr + 1);	/* data follows lr_write_t */
 	uint64_t start, nbytes;
 	dmu_tx_t *tx;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	start = lr->lr_offset;
 	nbytes = lr->lr_length;
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 
 		if (nbytes < blocksize) {
 			/* Widen to the enclosing block, aligned downwards. */
 			start -= start % blocksize;
 			nbytes = blocksize;
 		}
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZVOL_OBJ, start, nbytes);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	dmu_write(os, ZVOL_OBJ, start, nbytes, payload, tx);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * ZIL replay handler that rejects the record: every argument is ignored
  * and ENOTSUP is returned.
  */
 /* ARGSUSED */
 static int
 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
 {
 	int rc = SET_ERROR(ENOTSUP);
 
 	return (rc);
 }
 
 /*
  * Callback vectors for replaying records.
  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
  *
  * The table is positional: the handler for a transaction type lives in
  * the slot named by the trailing comment on each row.  Every other slot
  * is filled with zvol_replay_err, which rejects the record with ENOTSUP.
  */
 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* 0 no such transaction type */
 	zvol_replay_err,	/* TX_CREATE */
 	zvol_replay_err,	/* TX_MKDIR */
 	zvol_replay_err,	/* TX_MKXATTR */
 	zvol_replay_err,	/* TX_SYMLINK */
 	zvol_replay_err,	/* TX_REMOVE */
 	zvol_replay_err,	/* TX_RMDIR */
 	zvol_replay_err,	/* TX_LINK */
 	zvol_replay_err,	/* TX_RENAME */
 	zvol_replay_write,	/* TX_WRITE */
 	zvol_replay_truncate,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
 	zvol_replay_err,	/* TX_CREATE_ACL */
 	zvol_replay_err,	/* TX_CREATE_ATTR */
 	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL */
 	zvol_replay_err,	/* TX_MKDIR_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
 	zvol_replay_err,	/* TX_WRITE2 */
 };
 
 #ifdef illumos
 /*
  * Look up the zvol called `name' under zfsdev_state_lock.
  *
  * Returns 0 if it exists, storing its minor number through `minor' when
  * that pointer is non-NULL; returns -1 if no such zvol is found.
  */
 int
 zvol_name2minor(const char *name, minor_t *minor)
 {
 	zvol_state_t *zv;
 	int rc = -1;
 
 	mutex_enter(&zfsdev_state_lock);
 	zv = zvol_minor_lookup(name);
 	if (zv != NULL) {
 		rc = 0;
 		if (minor != NULL)
 			*minor = zv->zv_minor;
 	}
 	mutex_exit(&zfsdev_state_lock);
 
 	return (rc);
 }
 #endif	/* illumos */
 
 /*
  * Create a minor node (plus a whole lot more) for the specified volume.
  */
 int
 zvol_create_minor(const char *name)
 {
 	zfs_soft_state_t *zs;
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t doi;
 #ifdef illumos
 	minor_t minor = 0;
 	char chrbuf[30], blkbuf[30];
 #else
 	struct cdev *dev;
 	struct g_provider *pp;
 	struct g_geom *gp;
 	uint64_t volsize, mode;
 #endif
 	int error;
 
 #ifndef illumos
 	ZFS_LOG(1, "Creating ZVOL %s...", name);
 #endif
 
 	mutex_enter(&zfsdev_state_lock);
 
 	if (zvol_minor_lookup(name) != NULL) {
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/* lie and say we're read-only */
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 
 	if (error) {
 		mutex_exit(&zfsdev_state_lock);
 		return (error);
 	}
 
 #ifdef illumos
 	if ((minor = zfsdev_minor_alloc()) == 0) {
 		dmu_objset_disown(os, FTAG);
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 		dmu_objset_disown(os, FTAG);
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 	    (char *)name);
 
 	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 
 	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, FTAG);
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 
 	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 
 	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 		ddi_remove_minor_node(zfs_dip, chrbuf);
 		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, FTAG);
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 
 	zs = ddi_get_soft_state(zfsdev_state, minor);
 	zs->zss_type = ZSST_ZVOL;
 	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 #else	/* !illumos */
 
 	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
 	zv->zv_state = 0;
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
 		kmem_free(zv, sizeof(*zv));
 		dmu_objset_disown(os, zvol_tag);
 		mutex_exit(&zfsdev_state_lock);
 		return (error);
 	}
 	error = dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
 	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
 		mode = volmode;
 
 	DROP_GIANT();
 	zv->zv_volsize = volsize;
 	zv->zv_volmode = mode;
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		g_topology_lock();
 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
 		gp->start = zvol_geom_start;
 		gp->access = zvol_geom_access;
 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
 		pp->sectorsize = DEV_BSIZE;
 		pp->mediasize = zv->zv_volsize;
 		pp->private = zv;
 
 		zv->zv_provider = pp;
 		bioq_init(&zv->zv_queue);
 		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 		if (make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
 		    &dev, &zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR,
 		    0640, "%s/%s", ZVOL_DRIVER, name) != 0) {
 			kmem_free(zv, sizeof(*zv));
 			dmu_objset_disown(os, FTAG);
 			mutex_exit(&zfsdev_state_lock);
 			return (SET_ERROR(ENXIO));
 		}
 		zv->zv_dev = dev;
 		dev->si_iosize_max = MAXPHYS;
 		dev->si_drv2 = zv;
 	}
 	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
 #endif	/* illumos */
 
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	zv->zv_min_bs = DEV_BSHIFT;
 #ifdef illumos
 	zv->zv_minor = minor;
 #endif
 	zv->zv_objset = os;
 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 	    offsetof(zvol_extent_t, ze_node));
 	/* get and cache the blocksize */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
 			zil_destroy(dmu_objset_zil(os), B_FALSE);
 		else
 			zil_replay(os, zv, zvol_replay_vector);
 	}
 	dmu_objset_disown(os, FTAG);
 	zv->zv_objset = NULL;
 
 	zvol_minors++;
 
 	mutex_exit(&zfsdev_state_lock);
 #ifndef illumos
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		zvol_geom_run(zv);
 		g_topology_unlock();
 	}
 	PICKUP_GIANT();
 
 	ZFS_LOG(1, "ZVOL %s created.", name);
 #endif
 
 	return (0);
 }
 
 /*
  * Remove minor node for the specified volume.
  *
  * The caller must hold zfsdev_state_lock.  Returns EBUSY, leaving the
  * volume untouched, if it still has opens; otherwise the device nodes
  * and the zvol_state_t itself are torn down and 0 is returned.
  */
 static int
 zvol_remove_zv(zvol_state_t *zv)
 {
 #ifdef illumos
 	minor_t minor = zv->zv_minor;
 	char nodename[20];
 #endif
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	/* A volume that is still open cannot be removed. */
 	if (zv->zv_total_opens != 0)
 		return (SET_ERROR(EBUSY));
 
 #ifdef illumos
 	/* Drop the "N,raw" node, then the "N" node. */
 	(void) snprintf(nodename, sizeof (nodename), "%u,raw", minor);
 	ddi_remove_minor_node(zfs_dip, nodename);
 
 	(void) snprintf(nodename, sizeof (nodename), "%u", minor);
 	ddi_remove_minor_node(zfs_dip, nodename);
 #else
 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
 
 	LIST_REMOVE(zv, zv_links);
 	switch (zv->zv_volmode) {
 	case ZFS_VOLMODE_GEOM:
 		g_topology_lock();
 		zvol_geom_destroy(zv);
 		g_topology_unlock();
 		break;
 	case ZFS_VOLMODE_DEV:
 		destroy_dev(zv->zv_dev);
 		break;
 	default:
 		/* Other volmodes have no device node to destroy here. */
 		break;
 	}
 #endif
 
 	/* Tear down the range-lock state, then the zvol itself. */
 	avl_destroy(&zv->zv_znode.z_range_avl);
 	mutex_destroy(&zv->zv_znode.z_range_lock);
 
 	kmem_free(zv, sizeof (zvol_state_t));
 #ifdef illumos
 	ddi_soft_state_free(zfsdev_state, minor);
 #endif
 	zvol_minors--;
 
 	return (0);
 }
 
 /*
  * Remove the zvol called `name'.
  *
  * Takes zfsdev_state_lock around the lookup and removal.  Returns ENXIO
  * if no such zvol exists, otherwise the result of zvol_remove_zv().
  */
 int
 zvol_remove_minor(const char *name)
 {
 	zvol_state_t *zv;
 	int rc;
 
 	mutex_enter(&zfsdev_state_lock);
 	zv = zvol_minor_lookup(name);
 	if (zv != NULL)
 		rc = zvol_remove_zv(zv);
 	else
 		rc = SET_ERROR(ENXIO);
 	mutex_exit(&zfsdev_state_lock);
 
 	return (rc);
 }
 
 /*
  * Bring a zvol into its "open" state: own the objset (tag zvol_tag), read
  * the volume size from the property ZAP, hold the bonus buffer of the
  * data object, open the ZIL and compute the ZVOL_RDONLY flag.
  *
  * Returns 0 on success.  On failure the objset is disowned again and
  * zv->zv_objset is reset to NULL, so that code which tests zv_objset
  * against NULL to decide whether an objset is available never sees a
  * pointer to an objset this zvol no longer owns.
  */
 int
 zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	uint64_t volsize;
 	int error;
 	uint64_t readonly;
 
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
 	    zvol_tag, &os);
 	if (error)
 		return (error);
 
 	zv->zv_objset = os;
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
 		/* Kept from the original: trips on debug builds. */
 		ASSERT(error == 0);
 		goto fail;
 	}
 
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 	if (error)
 		goto fail;
 
 	zvol_size_changed(zv, volsize);
 	zv->zv_zilog = zil_open(os, zvol_get_data);
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 	    NULL) == 0);
 	if (readonly || dmu_objset_is_snapshot(os) ||
 	    !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 	else
 		zv->zv_flags &= ~ZVOL_RDONLY;
 	return (error);
 
 fail:
 	/* Undo the ownership and do not leave a stale objset pointer. */
 	dmu_objset_disown(os, zvol_tag);
 	zv->zv_objset = NULL;
 	return (error);
 }
 
 /*
  * Undo zvol_first_open(): close the ZIL, release the bonus buffer, flush
  * and evict cached data, and give up ownership of the objset.  On return
  * zv_zilog, zv_dbuf and zv_objset are all NULL.
  */
 void
 zvol_last_close(zvol_state_t *zv)
 {
 	objset_t *os = zv->zv_objset;
 
 	zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
 
 	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 	zv->zv_dbuf = NULL;
 
 	/*
 	 * Evict cached data.  A dirty dataset on a writeable volume is
 	 * synced out first.
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(os)) &&
 	    (zv->zv_flags & ZVOL_RDONLY) == 0) {
 		txg_wait_synced(dmu_objset_pool(os), 0);
 	}
 	dmu_objset_evict_dbufs(os);
 
 	dmu_objset_disown(os, zvol_tag);
 	zv->zv_objset = NULL;
 }
 
 #ifdef illumos
 /*
  * Preallocate the whole volume, in chunks of at most
  * SPA_OLD_MAXBLOCKSIZE bytes, one transaction per chunk.
  *
  * Returns ENOSPC up front if the objset reports less available space than
  * the volume size.  If a transaction cannot be assigned, everything
  * preallocated so far is freed again and that error is returned.
  * Returns 0 once all chunks are written and the pool has synced.
  */
 int
 zvol_prealloc(zvol_state_t *zv)
 {
 	objset_t *os = zv->zv_objset;
 	uint64_t refd, avail, usedobjs, availobjs;
 	uint64_t remaining = zv->zv_volsize;
 	uint64_t cursor = 0;
 	dmu_tx_t *tx;
 
 	/* Check the space usage before attempting to allocate the space */
 	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 	if (avail < zv->zv_volsize)
 		return (SET_ERROR(ENOSPC));
 
 	/* Free old extents if they exist */
 	zvol_free_extents(zv);
 
 	while (remaining != 0) {
 		uint64_t chunk = MIN(remaining, SPA_OLD_MAXBLOCKSIZE);
 		int error;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZVOL_OBJ, cursor, chunk);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			/* Roll back what has been preallocated so far. */
 			dmu_tx_abort(tx);
 			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, cursor);
 			return (error);
 		}
 		dmu_prealloc(os, ZVOL_OBJ, cursor, chunk, tx);
 		dmu_tx_commit(tx);
 
 		cursor += chunk;
 		remaining -= chunk;
 	}
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	return (0);
 }
 #endif	/* illumos */
 
 /*
  * Persist a new volume size: store it under "size" in the property ZAP
  * and, if that succeeds, free everything in the data object from the new
  * size to the end of the object.
  *
  * The caller must hold zfsdev_state_lock.  Returns 0 or the error from
  * the step that failed.
  */
 static int
 zvol_update_volsize(objset_t *os, uint64_t volsize)
 {
 	dmu_tx_t *tx;
 	int error;
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 	    &volsize, tx);
 	dmu_tx_commit(tx);
 	if (error != 0)
 		return (error);
 
 	/* Drop any data beyond the new end of the volume. */
 	return (dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END));
 }
 
 /*
  * Remove the zvols that live underneath `name'.
  *
  * illumos: every zvol whose name starts with "<name>/" is removed.
  * Otherwise: the zvol called exactly `name' is removed, together with
  * every zvol whose name starts with `name' followed by '/' or '@'.
  *
  * Failures of zvol_remove_zv() (for instance a busy volume) are ignored.
  */
 void
 zvol_remove_minors(const char *name)
 {
 #ifdef illumos
 	zvol_state_t *zv;
 	char *prefix;
 	size_t prefixsz = strlen(name) + 2;
 	minor_t minor;
 
 	/* Build "<name>/"; the buffer is zeroed, so it stays terminated. */
 	prefix = kmem_zalloc(prefixsz, KM_SLEEP);
 	(void) strncpy(prefix, name, strlen(name));
 	(void) strcat(prefix, "/");
 
 	mutex_enter(&zfsdev_state_lock);
 	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 		if (zv != NULL &&
 		    strncmp(prefix, zv->zv_name, strlen(prefix)) == 0) {
 			(void) zvol_remove_zv(zv);
 		}
 	}
 	kmem_free(prefix, prefixsz);
 
 	mutex_exit(&zfsdev_state_lock);
 #else	/* !illumos */
 	zvol_state_t *zv, *tzv;
 	size_t namelen = strlen(name);
 	char sep;
 
 	DROP_GIANT();
 	mutex_enter(&zfsdev_state_lock);
 
 	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
 		if (strncmp(zv->zv_name, name, namelen) != 0)
 			continue;
 
 		/*
 		 * The name matches up to namelen; what follows decides:
 		 * end of string (exact match), a child ('/') or a
 		 * snapshot ('@').
 		 */
 		sep = zv->zv_name[namelen];
 		if (sep == '\0' || sep == '/' || sep == '@')
 			(void) zvol_remove_zv(zv);
 	}
 
 	mutex_exit(&zfsdev_state_lock);
 	PICKUP_GIANT();
 #endif	/* illumos */
 }
 
 /*
  * Apply a new size to an in-core zvol.
  *
  * The caller must hold zfsdev_state_lock.  With ZVOL_DUMP, a dumpified
  * volume also has its dump area resized; if that fails the old size is
  * restored.  On illumos a LUN expansion sysevent is generated when no
  * error occurred.  Returns 0 or the error from the dump handling.
  */
 static int
 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 {
 	uint64_t prev_volsize;
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	/*
 	 * Reinitialize the dump area to the new size. If we
 	 * failed to resize the dump area then restore it back to
 	 * its original size.  We must set the new volsize prior
 	 * to calling dumpvp_resize() to ensure that the devices'
 	 * size(9P) is not visible by the dump subsystem.
 	 */
 	prev_volsize = zv->zv_volsize;
 	zvol_size_changed(zv, volsize);
 
 #ifdef ZVOL_DUMP
 	if ((zv->zv_flags & ZVOL_DUMPIFIED) != 0) {
 		error = zvol_dumpify(zv);
 		if (error == 0)
 			error = dumpvp_resize();
 
 		if (error != 0) {
 			int rollback_error;
 
 			/* Put the old size back and re-dumpify with it. */
 			(void) zvol_update_volsize(zv->zv_objset, prev_volsize);
 			zvol_size_changed(zv, prev_volsize);
 			rollback_error = zvol_dumpify(zv);
 			if (rollback_error != 0)
 				error = rollback_error;
 		}
 	}
 #endif	/* ZVOL_DUMP */
 
 #ifdef illumos
 	/*
 	 * Generate a LUN expansion event.
 	 */
 	if (error == 0) {
 		char *devpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		nvlist_t *attrs;
 		sysevent_id_t eid;
 
 		(void) snprintf(devpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 		    zv->zv_minor);
 
 		VERIFY(nvlist_alloc(&attrs, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(attrs, DEV_PHYS_PATH, devpath) == 0);
 
 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 		    ESC_DEV_DLE, attrs, &eid, DDI_SLEEP);
 
 		nvlist_free(attrs);
 		kmem_free(devpath, MAXPATHLEN);
 	}
 #endif	/* illumos */
 	return (error);
 }
 
 /*
  * Change the size of the volume called `name' to `volsize' bytes.
  *
  * Returns EROFS if the dataset's readonly property is set.  If no in-core
  * zvol exists for the name, or it exists but has no objset attached, the
  * objset is owned temporarily (tag FTAG) for the duration of the call.
  * The new size is validated against the data block size, written to
  * disk, and then applied to the in-core zvol if there is one.
  *
  * Returns 0 on success or the error from the first step that failed.
  */
 int
 zvol_set_volsize(const char *name, uint64_t volsize)
 {
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t doi;
 	uint64_t readonly;
 	boolean_t owned = B_FALSE;
 	int error;
 
 	error = dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 	if (error != 0)
 		return (error);
 	if (readonly)
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&zfsdev_state_lock);
 	zv = zvol_minor_lookup(name);
 
 	if (zv != NULL && zv->zv_objset != NULL) {
 		/* The zvol already has an objset attached; reuse it. */
 		os = zv->zv_objset;
 	} else {
 		error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
 		    FTAG, &os);
 		if (error != 0) {
 			mutex_exit(&zfsdev_state_lock);
 			return (error);
 		}
 		owned = B_TRUE;
 		if (zv != NULL)
 			zv->zv_objset = os;
 	}
 
 	/* Each step runs only if everything before it succeeded. */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	if (error == 0)
 		error = zvol_check_volsize(volsize, doi.doi_data_block_size);
 	if (error == 0)
 		error = zvol_update_volsize(os, volsize);
 	if (error == 0 && zv != NULL)
 		error = zvol_update_live_volsize(zv, volsize);
 
 	/* Give back the temporary ownership, if we took it. */
 	if (owned) {
 		dmu_objset_disown(os, FTAG);
 		if (zv != NULL)
 			zv->zv_objset = NULL;
 	}
 	mutex_exit(&zfsdev_state_lock);
 
 	return (error);
 }
 
 /*
  * Open a zvol.
  *
  * The zvol is found from the device minor (illumos) or from pp->private
  * (otherwise).  If it currently has no opens, zvol_first_open() is called
  * first.  The open is then refused with EROFS for a write open of a
  * ZVOL_RDONLY volume, and with EBUSY if the volume is held exclusively or
  * (where FEXCL is defined) an exclusive open is requested while other
  * opens exist.  On success the open counters are bumped.
  *
  * Returns 0 on success, ENXIO if no zvol is attached to the device, or
  * one of the errors above.
  */
 /*ARGSUSED*/
 #ifdef illumos
 int
 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 #else
 static int
 zvol_open(struct g_provider *pp, int flag, int count)
 #endif
 {
 	zvol_state_t *zv;
 	int err = 0;
 #ifdef illumos
 
 	mutex_enter(&zfsdev_state_lock);
 
 	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 	if (zv == NULL) {
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (zv->zv_total_opens == 0)
 		err = zvol_first_open(zv);
 	if (err) {
 		mutex_exit(&zfsdev_state_lock);
 		return (err);
 	}
 #else	/* !illumos */
 	/* Set only if this call took zfsdev_state_lock itself. */
 	boolean_t locked = B_FALSE;
 
 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
 		/*
 		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
 		 * attempting to probe geom providers while looking for a
 		 * replacement for a missing VDEV.  In this case, the
 		 * spa_namespace_lock will not be held, but it is still illegal
 		 * to use a zvol as a vdev.  Deadlocks can result if another
 		 * thread has spa_namespace_lock
 		 */
 		return (EOPNOTSUPP);
 	}
 	/*
 	 * Protect against recursively entering spa_namespace_lock
 	 * when spa_open() is used for a pool on a (local) ZVOL(s).
 	 * This is needed since we replaced upstream zfsdev_state_lock
 	 * with spa_namespace_lock in the ZVOL code.
 	 * We are using the same trick as spa_open().
 	 * Note that calls in zvol_first_open which need to resolve
 	 * pool name to a spa object will enter spa_open()
 	 * recursively, but that function already has all the
 	 * necessary protection.
 	 */
 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
 		mutex_enter(&zfsdev_state_lock);
 		locked = B_TRUE;
 	}
 
 	zv = pp->private;
 	if (zv == NULL) {
 		if (locked)
 			mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (zv->zv_total_opens == 0) {
 		err = zvol_first_open(zv);
 		if (err) {
 			if (locked)
 				mutex_exit(&zfsdev_state_lock);
 			return (err);
 		}
 		/* Publish the freshly read geometry on the provider. */
 		pp->mediasize = zv->zv_volsize;
 		pp->stripeoffset = 0;
 		pp->stripesize = zv->zv_volblocksize;
 	}
 #endif	/* illumos */
 	/* Write opens are refused on a read-only volume. */
 	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 		err = SET_ERROR(EROFS);
 		goto out;
 	}
 	/* Somebody already holds the volume exclusively. */
 	if (zv->zv_flags & ZVOL_EXCL) {
 		err = SET_ERROR(EBUSY);
 		goto out;
 	}
 #ifdef FEXCL
 	/* An exclusive open is only granted when nobody else has it open. */
 	if (flag & FEXCL) {
 		if (zv->zv_total_opens != 0) {
 			err = SET_ERROR(EBUSY);
 			goto out;
 		}
 		zv->zv_flags |= ZVOL_EXCL;
 	}
 #endif
 
 #ifdef illumos
 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 		zv->zv_open_count[otyp]++;
 		zv->zv_total_opens++;
 	}
 	mutex_exit(&zfsdev_state_lock);
 #else
 	zv->zv_total_opens += count;
 	if (locked)
 		mutex_exit(&zfsdev_state_lock);
 #endif
 
 	return (err);
 out:
 	/*
 	 * Error path: with no opens recorded, the zvol_first_open() done
 	 * above has to be undone again.
 	 */
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
 #ifdef illumos
 	mutex_exit(&zfsdev_state_lock);
 #else
 	if (locked)
 		mutex_exit(&zfsdev_state_lock);
 #endif
 	return (err);
 }
 
 /*
  * Close a zvol.
  *
  * The zvol is found from the device minor (illumos) or from pp->private
  * (otherwise); ENXIO is returned if there is none.  Any exclusive-open
  * flag is cleared, the open counters are reduced, and when the total
  * number of opens reaches zero zvol_last_close() is called.
  *
  * Note that the two platform variants share the tail of the first
  * "zv == NULL" block below: each #ifdef arm opens the function and the
  * if-body, and the common code closes them.
  */
 /*ARGSUSED*/
 #ifdef illumos
 int
 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
 {
 	minor_t minor = getminor(dev);
 	zvol_state_t *zv;
 	int error = 0;
 
 	mutex_enter(&zfsdev_state_lock);
 
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL) {
 		mutex_exit(&zfsdev_state_lock);
 #else	/* !illumos */
 static int
 zvol_close(struct g_provider *pp, int flag, int count)
 {
 	zvol_state_t *zv;
 	int error = 0;
 	/* Set only if this call took zfsdev_state_lock itself. */
 	boolean_t locked = B_FALSE;
 
 	/* See comment in zvol_open(). */
 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
 		mutex_enter(&zfsdev_state_lock);
 		locked = B_TRUE;
 	}
 
 	zv = pp->private;
 	if (zv == NULL) {
 		if (locked)
 			mutex_exit(&zfsdev_state_lock);
 #endif	/* illumos */
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* An exclusive open implies exactly one open; release it. */
 	if (zv->zv_flags & ZVOL_EXCL) {
 		ASSERT(zv->zv_total_opens == 1);
 		zv->zv_flags &= ~ZVOL_EXCL;
 	}
 
 	/*
 	 * If the open count is zero, this is a spurious close.
 	 * That indicates a bug in the kernel / DDI framework.
 	 */
 #ifdef illumos
 	ASSERT(zv->zv_open_count[otyp] != 0);
 #endif
 	ASSERT(zv->zv_total_opens != 0);
 
 	/*
 	 * You may get multiple opens, but only one close.
 	 */
 #ifdef illumos
 	zv->zv_open_count[otyp]--;
 	zv->zv_total_opens--;
 #else
 	zv->zv_total_opens -= count;
 #endif
 
 	/* Last reference gone: undo what zvol_first_open() set up. */
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
 
 #ifdef illumos
 	mutex_exit(&zfsdev_state_lock);
 #else
 	if (locked)
 		mutex_exit(&zfsdev_state_lock);
 #endif
 	return (error);
 }
 
 /*
  * Finish a zvol_get_data() request: release the held buffer (if any),
  * drop the range lock, hand the block pointer to the ZIL when the
  * operation succeeded and a block pointer was recorded, and free the
  * zgd itself.
  */
 static void
 zvol_get_done(zgd_t *zgd, int error)
 {
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	if (zgd->zgd_bp != NULL && error == 0)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (*zgd));
 }
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  * With a non-NULL `buf' the logged range is read straight into it
  * (immediate write).  With a NULL `buf' the whole block containing the
  * offset is held and handed to dmu_sync() (indirect write); in that case
  * a successful dmu_sync() takes over the zgd and zvol_get_done() is run
  * by it, not here.
  *
  * Returns 0 on success or the error from the DMU.
  */
 static int
 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zvol_state_t *zv = arg;
 	objset_t *os = zv->zv_objset;
 	uint64_t object = ZVOL_OBJ;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;	/* length of user data */
 	blkptr_t *bp = &lr->lr_blkptr;
 	blkptr_t *obp;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
 
 	ASSERT(zio != NULL);
 	ASSERT(size != 0);
 
 	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zv->zv_zilog;
 	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) {	/* immediate write */
 		error = dmu_read(os, object, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
 		zvol_get_done(zgd, error);
 		return (error);
 	}
 
 	/* Indirect write: operate on the whole enclosing block. */
 	size = zv->zv_volblocksize;
 	offset = P2ALIGN(offset, size);
 	error = dmu_buf_hold(os, object, offset, zgd, &db,
 	    DMU_READ_NO_PREFETCH);
 	if (error != 0) {
 		zvol_get_done(zgd, error);
 		return (error);
 	}
 
 	obp = dmu_buf_get_blkptr(db);
 	if (obp != NULL) {
 		ASSERT(BP_IS_HOLE(bp));
 		*bp = *obp;
 	}
 
 	zgd->zgd_db = db;
 	zgd->zgd_bp = bp;
 
 	ASSERT(db->db_offset == offset);
 	ASSERT(db->db_size == size);
 
 	error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd);
 	if (error != 0)
 		zvol_get_done(zgd, error);
 
 	return (error);
 }
 
 /*
  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
  *
  * We store data in the log buffers if it's small enough.
  * Otherwise we will later flush the data out via dmu_sync().
  */
 ssize_t zvol_immediate_write_sz = 32768;
 
 /*
  * Log the write of `resid' bytes at offset `off' as one or more TX_WRITE
  * itxs.  Nothing is logged while the ZIL is replaying.  Each itx is
  * created in one of three states:
  *   WR_INDIRECT  - a full, block-aligned block that dmu_sync() will write;
  *   WR_COPIED    - synchronous write, data copied into the itx right now;
  *   WR_NEED_COPY - data is fetched later.
  */
 static void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
     boolean_t sync)
 {
 	zilog_t *zilog = zv->zv_zilog;
 	uint32_t blocksize = zv->zv_volblocksize;
 	ssize_t immediate_write_sz;
 	boolean_t slogging;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	/* logbias=throughput disables immediate writes altogether. */
 	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 		immediate_write_sz = 0;
 	else
 		immediate_write_sz = zvol_immediate_write_sz;
 
 	slogging = spa_has_slogs(zilog->zl_spa) &&
 	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 	while (resid) {
 		itx_wr_state_t write_state;
 		lr_write_t *lr;
 		itx_t *itx;
 		ssize_t len;
 
 		/*
 		 * Unlike zfs_log_write() we can be called with
 		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
 		 */
 		if (blocksize > immediate_write_sz && !slogging &&
 		    resid >= blocksize && off % blocksize == 0) {
 			write_state = WR_INDIRECT; /* uses dmu_sync */
 			len = blocksize;
 		} else {
 			write_state = sync ? WR_COPIED : WR_NEED_COPY;
 			len = MIN(ZIL_MAX_LOG_DATA, resid);
 		}
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
 		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 
 		/*
 		 * For WR_COPIED the data is read into the record now; if
 		 * that read fails, fall back to a data-less WR_NEED_COPY
 		 * record.
 		 */
 		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
 		}
 
 		itx->itx_wr_state = write_state;
 		if (write_state == WR_NEED_COPY)
 			itx->itx_sod += len;
 
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;
 		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
 		itx->itx_sync = sync;
 
 		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
 		resid -= len;
 	}
 }
 
 #ifdef illumos
 /*
  * Perform dump I/O against one vdev of the tree.
  *
  * Mirror, replacing and spare vdevs are handled by recursing into their
  * children: a read stops at the first child that succeeds, a write goes
  * to every child, and the operation fails with EIO only if every child
  * failed.  RAID-Z vdevs are passed to vdev_raidz_physio().  Leaf vdevs
  * are accessed through LDI, using ldi_dump() while panicking or dumping
  * (writes only) and vdev_disk_ldi_physio() otherwise.
  */
 static int
 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
     uint64_t size, boolean_t doread, boolean_t isdump)
 {
 	vdev_disk_t *dvd;
 	int failed = 0;
 	int i;
 
 	if (vd->vdev_ops == &vdev_mirror_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops ||
 	    vd->vdev_ops == &vdev_spare_ops) {
 		for (i = 0; i < vd->vdev_children; i++) {
 			if (zvol_dumpio_vdev(vd->vdev_child[i], addr, offset,
 			    origoffset, size, doread, isdump) != 0) {
 				failed++;
 				continue;
 			}
 			/* One good copy is enough for a read. */
 			if (doread)
 				break;
 		}
 	}
 
 	/* Interior vdevs (other than RAID-Z) are done at this point. */
 	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
 		return (failed < vd->vdev_children ? 0 : EIO);
 
 	if (doread ? !vdev_readable(vd) : !vdev_writeable(vd))
 		return (SET_ERROR(EIO));
 
 	if (vd->vdev_ops == &vdev_raidz_ops) {
 		return (vdev_raidz_physio(vd,
 		    addr, size, offset, origoffset, doread, isdump));
 	}
 
 	/* Skip past the labels at the front of the device. */
 	offset += VDEV_LABEL_START_SIZE;
 
 	if (!ddi_in_panic() && !isdump) {
 		dvd = vd->vdev_tsd;
 		ASSERT3P(dvd, !=, NULL);
 		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
 		    offset, doread ? B_READ : B_WRITE));
 	}
 
 	/* Panic/dump context: only writes are possible. */
 	ASSERT(!doread);
 	if (doread)
 		return (SET_ERROR(EIO));
 	dvd = vd->vdev_tsd;
 	ASSERT3P(dvd, !=, NULL);
 	return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
 	    lbtodb(size)));
 }
 
 /*
  * Read or write part of a dumpified zvol directly on its vdevs.
  *
  * `offset' and `size' must be multiples of DEV_BSIZE and the range must
  * not cross a volume block boundary; otherwise EINVAL is returned.  The
  * offset is translated through the cached extent list (zv_extents) to a
  * top-level vdev and a device offset, and the I/O is handed to
  * zvol_dumpio_vdev().
  *
  * Returns EINVAL as well when the offset lies beyond the last cached
  * extent (including the case of an empty extent list), otherwise the
  * result of zvol_dumpio_vdev().
  */
 static int
 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
     boolean_t doread, boolean_t isdump)
 {
 	vdev_t *vd;
 	int error;
 	zvol_extent_t *ze;
 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
 
 	/* Must be sector aligned, and not straddle a block boundary. */
 	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
 	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
 		return (SET_ERROR(EINVAL));
 	}
 	ASSERT(size <= zv->zv_volblocksize);
 
 	/*
 	 * Locate the extent this belongs to.  The walk has to stop when the
 	 * list runs out: list_head()/list_next() yield NULL then, and the
 	 * extent must not be dereferenced before the NULL check below.
 	 */
 	ze = list_head(&zv->zv_extents);
 	while (ze != NULL &&
 	    offset >= ze->ze_nblks * zv->zv_volblocksize) {
 		offset -= ze->ze_nblks * zv->zv_volblocksize;
 		ze = list_next(&zv->zv_extents, ze);
 	}
 
 	if (ze == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* The config lock is not taken while panicking. */
 	if (!ddi_in_panic())
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
 	offset += DVA_GET_OFFSET(&ze->ze_dva);
 	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
 	    size, doread, isdump);
 
 	if (!ddi_in_panic())
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 	return (error);
 }
 
 /*
  * Strategy routine: carry out the block I/O request described by `bp'
  * against the zvol's data object.
  *
  * illumos variant: takes a buf_t, always returns 0 and reports errors
  * through bioerror()/biodone().
  * Other variant: takes a struct bio, returns nothing and completes the
  * request through g_io_deliver() (when bio_to is set) or biofinish().
  *
  * Reads and writes are performed in pieces of at most zvol_maxphys bytes
  * under a range lock covering the whole request.  Writes are logged via
  * zvol_log_write() and, when `sync' is set, committed to the ZIL before
  * the request completes.
  */
 int
 zvol_strategy(buf_t *bp)
 {
 	zfs_soft_state_t *zs = NULL;
 #else	/* !illumos */
 void
 zvol_strategy(struct bio *bp)
 {
 #endif	/* illumos */
 	zvol_state_t *zv;
 	uint64_t off, volsize;
 	size_t resid;
 	char *addr;
 	objset_t *os;
 	rl_t *rl;
 	int error = 0;
 #ifdef illumos
 	boolean_t doread = bp->b_flags & B_READ;
 #else
 	boolean_t doread = 0;
 #endif
 	boolean_t is_dumpified;
 	boolean_t sync;
 
 #ifdef illumos
 	/* Resolve the minor to a zvol; minor 0 is never a zvol here. */
 	if (getminor(bp->b_edev) == 0) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
 		if (zs == NULL)
 			error = SET_ERROR(ENXIO);
 		else if (zs->zss_type != ZSST_ZVOL)
 			error = SET_ERROR(EINVAL);
 	}
 
 	if (error) {
 		bioerror(bp, error);
 		biodone(bp);
 		return (0);
 	}
 
 	zv = zs->zss_data;
 
 	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
 		bioerror(bp, EROFS);
 		biodone(bp);
 		return (0);
 	}
 
 	off = ldbtob(bp->b_blkno);
 #else	/* !illumos */
 	/* The zvol hangs off the GEOM provider or the character device. */
 	if (bp->bio_to)
 		zv = bp->bio_to->private;
 	else
 		zv = bp->bio_dev->si_drv2;
 
 	if (zv == NULL) {
 		error = SET_ERROR(ENXIO);
 		goto out;
 	}
 
 	/* Anything but a read is refused on a read-only volume. */
 	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
 		error = SET_ERROR(EROFS);
 		goto out;
 	}
 
 	switch (bp->bio_cmd) {
 	case BIO_FLUSH:
 		/* No data transfer: jump straight to the ZIL commit. */
 		goto sync;
 	case BIO_READ:
 		doread = 1;
 		/* FALLTHROUGH */
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto out;
 	}
 
 	off = bp->bio_offset;
 #endif	/* illumos */
 	volsize = zv->zv_volsize;
 
 	os = zv->zv_objset;
 	ASSERT(os != NULL);
 
 #ifdef illumos
 	bp_mapin(bp);
 	addr = bp->b_un.b_addr;
 	resid = bp->b_bcount;
 
 	/* A non-empty request must start inside the volume. */
 	if (resid > 0 && (off < 0 || off >= volsize)) {
 		bioerror(bp, EIO);
 		biodone(bp);
 		return (0);
 	}
 
 	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
 	sync = ((!(bp->b_flags & B_ASYNC) &&
 	    !(zv->zv_flags & ZVOL_WCE)) ||
 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
 	    !doread && !is_dumpified;
 #else	/* !illumos */
 	addr = bp->bio_data;
 	resid = bp->bio_length;
 
 	/* A non-empty request must start inside the volume. */
 	if (resid > 0 && (off < 0 || off >= volsize)) {
 		error = SET_ERROR(EIO);
 		goto out;
 	}
 
 	is_dumpified = B_FALSE;
 	sync = !doread && !is_dumpified &&
 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 #endif	/* illumos */
 
 	/*
 	 * There must be no buffer changes when doing a dmu_sync() because
 	 * we can't change the data whilst calculating the checksum.
 	 */
 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
 	    doread ? RL_READER : RL_WRITER);
 
 #ifndef illumos
 	/*
 	 * BIO_DELETE: log the truncate, then free the range.  On success
 	 * resid is zeroed so the request is reported as fully completed.
 	 */
 	if (bp->bio_cmd == BIO_DELETE) {
 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			zvol_log_truncate(zv, tx, off, resid, B_TRUE);
 			dmu_tx_commit(tx);
 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 			    off, resid);
 			resid = 0;
 		}
 		goto unlock;
 	}
 #endif
 	/* Transfer in pieces, stopping at the end of the volume. */
 	while (resid != 0 && off < volsize) {
 		size_t size = MIN(resid, zvol_maxphys);
 #ifdef illumos
 		if (is_dumpified) {
 			/* Dump I/O may not cross a volume block boundary. */
 			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
 			error = zvol_dumpio(zv, addr, off, size,
 			    doread, B_FALSE);
 		} else if (doread) {
 #else
 		if (doread) {
 #endif
 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
 			    DMU_READ_PREFETCH);
 		} else {
 			dmu_tx_t *tx = dmu_tx_create(os);
 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
 			error = dmu_tx_assign(tx, TXG_WAIT);
 			if (error) {
 				dmu_tx_abort(tx);
 			} else {
 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
 				zvol_log_write(zv, tx, off, size, sync);
 				dmu_tx_commit(tx);
 			}
 		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 		off += size;
 		addr += size;
 		resid -= size;
 	}
 #ifndef illumos
 unlock:
 #endif
 	zfs_range_unlock(rl);
 
 #ifdef illumos
 	/* An error is only reported if nothing at all was transferred. */
 	if ((bp->b_resid = resid) == bp->b_bcount)
 		bioerror(bp, off > volsize ? EINVAL : error);
 
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	biodone(bp);
 
 	return (0);
 #else	/* !illumos */
 	bp->bio_completed = bp->bio_length - resid;
 	if (bp->bio_completed < bp->bio_length && off > volsize)
 		error = EINVAL;
 
 	/* The "sync" label is also the landing point for BIO_FLUSH. */
 	if (sync) {
 sync:
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	}
 out:
 	/* Complete the request the way it arrived: GEOM or cdev. */
 	if (bp->bio_to)
 		g_io_deliver(bp, error);
 	else
 		biofinish(bp, NULL, error);
 #endif	/* illumos */
 }
 
 #ifdef illumos
 /*
  * Set the buffer count to the zvol maximum transfer.
  * Using our own routine instead of the default minphys()
  * means that for larger writes we write bigger buffers on X86
  * (128K instead of 56K) and flush the disk write cache less often
  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
  * 56K on X86 and 128K on sparc).
  *
  * In short: b_bcount is clamped to zvol_maxphys and left alone when it
  * is already within that limit.
  */
 void
 zvol_minphys(struct buf *bp)
 {
 	if (bp->b_bcount <= zvol_maxphys)
 		return;
 
 	bp->b_bcount = zvol_maxphys;
 }
 
 /*
  * Write nblocks disk blocks from addr to a dumpified zvol, starting at
  * disk block blkno.  The transfer is issued through zvol_dumpio() in
  * pieces that never cross a volume block boundary.
  *
  * Returns ENXIO if the minor has no zvol state, EINVAL if the zvol is
  * not dumpified, otherwise 0 or the first error from zvol_dumpio().
  */
 int
 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
 {
 	zvol_state_t *zv;
 	uint64_t offset, remaining, chunk;
 	int error = 0;
 
 	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
 
 	if (!(zv->zv_flags & ZVOL_DUMPIFIED))
 		return (SET_ERROR(EINVAL));
 
 	offset = ldbtob(blkno);
 	remaining = ldbtob(nblocks);
 
 	VERIFY3U(offset + remaining, <=, zv->zv_volsize);
 
 	for (; remaining != 0;
 	    offset += chunk, addr += chunk, remaining -= chunk) {
 		/* Stop each piece at the end of the current volume block. */
 		chunk = MIN(remaining,
 		    P2END(offset, zv->zv_volblocksize) - offset);
 		error = zvol_dumpio(zv, addr, offset, chunk, B_FALSE, B_TRUE);
 		if (error)
 			break;
 	}
 
 	return (error);
 }
 
 /*
  * Character-device read entry point for a zvol.
  *
  * Copies data from the volume into the caller's uio, beginning at
  * uio_loffset and stopping at the end of the volume.  Returns 0 on
  * success or an errno value; ECKSUM from the DMU is reported as EIO.
  *
  * The illumos and non-illumos builds share one body but differ in their
  * signature and in how the zvol state is located.
  */
 /*ARGSUSED*/
 int
 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 {
 	minor_t minor = getminor(dev);
 #else	/* !illumos */
 int
 zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 #endif	/* illumos */
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
 
 #ifdef illumos
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
 #else
 	zv = dev->si_drv2;
 #endif
 
 	volsize = zv->zv_volsize;
 	/*
 	 * uio_loffset == volsize isn't an error, as it's required for EOF
 	 * processing.
 	 */
 	if (uio->uio_resid > 0 &&
 	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
 		return (SET_ERROR(EIO));
 
 #ifdef illumos
 	/* Dumpified volumes are read through zvol_strategy via physio(). */
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_READ,
 		    zvol_minphys, uio);
 		return (error);
 	}
 #endif
 
 	/* Hold a reader range lock over the whole request while copying. */
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 		/* Each pass moves at most half of DMU_MAX_ACCESS. */
 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 
 		/* don't read past the end */
 		if (bytes > volsize - uio->uio_loffset)
 			bytes = volsize - uio->uio_loffset;
 
 		error =  dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 	}
 	zfs_range_unlock(rl);
 	return (error);
 }
 
 /*
  * Character-device write entry point for a zvol.
  *
  * Copies data from the caller's uio into the volume, one DMU transaction
  * per piece, logging each successful piece with zvol_log_write().  Writing
  * stops at the end of the volume.  Returns 0 on success or an errno value.
  *
  * The illumos and non-illumos builds share one body but differ in their
  * signature, in how the zvol state is located, and in how "sync" is
  * derived.
  */
 #ifdef illumos
 /*ARGSUSED*/
 int
 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 {
 	minor_t minor = getminor(dev);
 #else	/* !illumos */
 int
 zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 #endif	/* illumos */
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
 	boolean_t sync;
 
 #ifdef illumos
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
 #else
 	zv = dev->si_drv2;
 #endif
 
 	volsize = zv->zv_volsize;
 	/*
 	 * uio_loffset == volsize isn't an error, as it's required for EOF
 	 * processing.
 	 */
 	if (uio->uio_resid > 0 &&
 	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
 		return (SET_ERROR(EIO));
 
 #ifdef illumos
 	/* Dumpified volumes are written through zvol_strategy via physio(). */
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_WRITE,
 		    zvol_minphys, uio);
 		return (error);
 	}
 
 	/*
 	 * The write is synchronous when the write cache is disabled
 	 * (illumos) or IO_SYNC was requested (otherwise), or whenever the
 	 * objset's sync setting is ZFS_SYNC_ALWAYS.
 	 */
 	sync = !(zv->zv_flags & ZVOL_WCE) ||
 #else
 	sync = (ioflag & IO_SYNC) ||
 #endif
 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 
 	/* Hold a writer range lock over the whole request while copying. */
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_WRITER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio->uio_loffset;
 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 
 		if (bytes > volsize - off)	/* don't write past the end */
 			bytes = volsize - off;
 
 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			break;
 		}
 		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
 		/* Only log the piece if the copy itself succeeded. */
 		if (error == 0)
 			zvol_log_write(zv, tx, off, bytes, sync);
 		/* The assigned tx is committed whether or not the copy failed. */
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_range_unlock(rl);
 	/* A synchronous write commits the ZIL even if the loop ended early. */
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	return (error);
 }
 
 #ifdef illumos
 /*
  * Synthesize an EFI label for a zvol in response to a dk_efi_t request.
  *
  * arg	user pointer to a dk_efi_t; copied in with ddi_copyin()
  * flag	passed through to ddi_copyin()/ddi_copyout()
  * vs	volume size; (vs >> bs) - 1 is used as the last LBA
  * bs	shift applied to vs to obtain a block count
  *
  * Only dki_lba 1 (GPT header followed by the partition entry) and
  * dki_lba 2 (partition entry only) with a positive dki_length are
  * served; anything else gets EINVAL.  Returns EFAULT if a copyin or
  * copyout fails, otherwise 0.
  */
 int
 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
 {
 	struct uuid uuid = EFI_RESERVED;
 	efi_gpe_t gpe = { 0 };
 	uint32_t crc;
 	dk_efi_t efi;
 	int length;
 	char *ptr;
 
 	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
 		return (SET_ERROR(EFAULT));
 	ptr = (char *)(uintptr_t)efi.dki_data_64;
 	length = efi.dki_length;
 	/*
 	 * Some clients may attempt to request a PMBR for the
 	 * zvol.  Currently this interface will return EINVAL to
 	 * such requests.  These requests could be supported by
 	 * adding a check for lba == 0 and consing up an appropriate
 	 * PMBR.
 	 */
 	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
 		return (SET_ERROR(EINVAL));
 
 	/* The single partition entry runs from LBA 34 to the last LBA. */
 	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
 	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
 	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
 
 	if (efi.dki_lba == 1) {
 		efi_gpt_t gpt = { 0 };
 
 		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
 		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
 		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
 		gpt.efi_gpt_MyLBA = LE_64(1ULL);
 		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
 		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
 		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
 		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
 		gpt.efi_gpt_SizeOfPartitionEntry =
 		    LE_32(sizeof (efi_gpe_t));
 		/* Entry-array CRC first, then the header CRC over the result. */
 		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
 		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
 		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
 		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
 		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
 		    flag))
 			return (SET_ERROR(EFAULT));
 		/* Advance past the header; length may go non-positive here. */
 		ptr += sizeof (gpt);
 		length -= sizeof (gpt);
 	}
 	/* Copy out as much of the partition entry as the caller has room for. */
 	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
 	    length), flag))
 		return (SET_ERROR(EFAULT));
 	return (0);
 }
 
 /*
  * BEGIN entry points to allow external callers access to the volume.
  */
 /*
  * Hand an external caller the parameters and opaque handles it needs to
  * access the volume.  Per the original contract these values do not
  * change for as long as the volume is held open.
  *
  * Returns ENXIO when the minor has no zvol state or the zvol is
  * dumpified, otherwise 0 with every output pointer filled in.
  */
 int
 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
     void **rl_hdl, void **bonus_hdl)
 {
 	zvol_state_t *zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 
 	/* Unknown minors and dumpified volumes are both refused. */
 	if (zv == NULL || (zv->zv_flags & ZVOL_DUMPIFIED) != 0)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT(blksize && max_xfer_len && minor_hdl &&
 	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
 
 	/* Opaque handles. */
 	*minor_hdl = zv;
 	*objset_hdl = zv->zv_objset;
 	*zil_hdl = zv->zv_zilog;
 	*rl_hdl = &zv->zv_znode;
 	*bonus_hdl = zv->zv_dbuf;
 
 	/* Transfer geometry. */
 	*blksize = zv->zv_volblocksize;
 	*max_xfer_len = (uint64_t)zvol_maxphys;
 
 	return (0);
 }
 
 /*
  * Return the current volume size to an external caller.
  * The size can change while the volume is open.
  */
 uint64_t
 zvol_get_volume_size(void *minor_hdl)
 {
 	zvol_state_t *zv = minor_hdl;
 
 	return (zv->zv_volsize);
 }
 
 /*
  * Report the volume's current WCE setting to an external caller:
  * 1 when ZVOL_WCE is set in zv_flags, 0 otherwise.
  * The setting may change while the volume is open.
  */
 int
 zvol_get_volume_wce(void *minor_hdl)
 {
 	zvol_state_t *state = minor_hdl;
 
 	return ((state->zv_flags & ZVOL_WCE) != 0);
 }
 
 /*
  * External-caller wrapper around zvol_log_write(): minor_hdl is the
  * zvol state handle, and the remaining arguments are forwarded as-is.
  */
 void
 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
     boolean_t sync)
 {
 	zvol_log_write((zvol_state_t *)minor_hdl, tx, off, resid, sync);
 }
 /*
  * END entry points to allow external callers access to the volume.
  */
 #endif	/* illumos */
 
 /*
  * Record a DKIOCFREE/free-long-range against the volume in the ZIL as a
  * TX_TRUNCATE entry covering [off, off + len).
  */
 static void
 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
     boolean_t sync)
 {
 	zilog_t *zilog = zv->zv_zilog;
 	lr_truncate_t *rec;
 	itx_t *itx;
 
 	/* Nothing is logged when zil_replaying() says so. */
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*rec));
 	itx->itx_sync = sync;
 
 	/* Fill in the truncate record embedded in the itx. */
 	rec = (lr_truncate_t *)&itx->itx_lr;
 	rec->lr_foid = ZVOL_OBJ;
 	rec->lr_offset = off;
 	rec->lr_length = len;
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 #ifdef illumos
 /*
  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
  *
  * Locking: zfsdev_state_lock is taken on entry.  Cases that end in
  * "return" drop the lock themselves before returning; cases that end in
  * "break" rely on the common unlock at the bottom of the function.
  *
  * Returns ENXIO if the minor has no zvol state, ENOTTY for an
  * unrecognized command, otherwise the per-command result.
  */
 /*ARGSUSED*/
 int
 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 {
 	zvol_state_t *zv;
 	struct dk_callback *dkc;
 	int error = 0;
 	rl_t *rl;
 
 	mutex_enter(&zfsdev_state_lock);
 
 	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
 
 	if (zv == NULL) {
 		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 	ASSERT(zv->zv_total_opens > 0);
 
 	switch (cmd) {
 
 	/* Controller information; filled in locally and copied out. */
 	case DKIOCINFO:
 	{
 		struct dk_cinfo dki;
 
 		bzero(&dki, sizeof (dki));
 		(void) strcpy(dki.dki_cname, "zvol");
 		(void) strcpy(dki.dki_dname, "zvol");
 		dki.dki_ctype = DKC_UNKNOWN;
 		dki.dki_unit = getminor(dev);
 		dki.dki_maxtransfer =
 		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
 		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
 			error = SET_ERROR(EFAULT);
 		return (error);
 	}
 
 	/* Media information: logical block size and capacity in blocks. */
 	case DKIOCGMEDIAINFO:
 	{
 		struct dk_minfo dkm;
 
 		bzero(&dkm, sizeof (dkm));
 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
 		dkm.dki_media_type = DK_UNKNOWN;
 		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
 			error = SET_ERROR(EFAULT);
 		return (error);
 	}
 
 	/* As above, plus the volume block size reported as dki_pbsize. */
 	case DKIOCGMEDIAINFOEXT:
 	{
 		struct dk_minfo_ext dkmext;
 
 		bzero(&dkmext, sizeof (dkmext));
 		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
 		dkmext.dki_pbsize = zv->zv_volblocksize;
 		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
 		dkmext.dki_media_type = DK_UNKNOWN;
 		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
 			error = SET_ERROR(EFAULT);
 		return (error);
 	}
 
 	case DKIOCGETEFI:
 	{
 		/* Snapshot the values under the lock, then build the label. */
 		uint64_t vs = zv->zv_volsize;
 		uint8_t bs = zv->zv_min_bs;
 
 		mutex_exit(&zfsdev_state_lock);
 		error = zvol_getefi((void *)arg, flag, vs, bs);
 		return (error);
 	}
 
 	case DKIOCFLUSHWRITECACHE:
 		dkc = (struct dk_callback *)arg;
 		mutex_exit(&zfsdev_state_lock);
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		/* The callback is only dereferenced when FKIOCTL is set. */
 		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
 			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
 			error = 0;
 		}
 		return (error);
 
 	case DKIOCGETWCE:
 	{
 		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
 		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
 		    flag))
 			error = SET_ERROR(EFAULT);
 		break;
 	}
 	case DKIOCSETWCE:
 	{
 		int wce;
 		if (ddi_copyin((void *)arg, &wce, sizeof (int),
 		    flag)) {
 			error = SET_ERROR(EFAULT);
 			break;
 		}
 		if (wce) {
 			zv->zv_flags |= ZVOL_WCE;
 			mutex_exit(&zfsdev_state_lock);
 		} else {
 			/* Turning the cache off also commits the ZIL. */
 			zv->zv_flags &= ~ZVOL_WCE;
 			mutex_exit(&zfsdev_state_lock);
 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		}
 		return (0);
 	}
 
 	case DKIOCGGEOM:
 	case DKIOCGVTOC:
 		/*
 		 * commands using these (like prtvtoc) expect ENOTSUP
 		 * since we're emulating an EFI label
 		 */
 		error = SET_ERROR(ENOTSUP);
 		break;
 
 	/* Dumpify the volume under a writer range lock on the whole zvol. */
 	case DKIOCDUMPINIT:
 		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
 		    RL_WRITER);
 		error = zvol_dumpify(zv);
 		zfs_range_unlock(rl);
 		break;
 
 	/* Undo dumpification; a no-op if the volume is not dumpified. */
 	case DKIOCDUMPFINI:
 		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
 			break;
 		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
 		    RL_WRITER);
 		error = zvol_dump_fini(zv);
 		zfs_range_unlock(rl);
 		break;
 
 	case DKIOCFREE:
 	{
 		dkioc_free_t df;
 		dmu_tx_t *tx;
 
 		/* With unmap disabled the request succeeds without effect. */
 		if (!zvol_unmap_enabled)
 			break;
 
 		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
 			error = SET_ERROR(EFAULT);
 			break;
 		}
 
 		/*
 		 * Apply Postel's Law to length-checking.  If they overshoot,
 		 * just blank out until the end, if there's a need to blank
 		 * out anything.
 		 */
 		if (df.df_start >= zv->zv_volsize)
 			break;	/* No need to do anything... */
 
 		mutex_exit(&zfsdev_state_lock);
 
 		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
 		    RL_WRITER);
 		tx = dmu_tx_create(zv->zv_objset);
 		dmu_tx_mark_netfree(tx);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			/* Log the truncate in this tx, then free the range. */
 			zvol_log_truncate(zv, tx, df.df_start,
 			    df.df_length, B_TRUE);
 			dmu_tx_commit(tx);
 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 			    df.df_start, df.df_length);
 		}
 
 		zfs_range_unlock(rl);
 
 		if (error == 0) {
 			/*
 			 * If the write-cache is disabled or 'sync' property
 			 * is set to 'always' then treat this as a synchronous
 			 * operation (i.e. commit to zil).
 			 */
 			if (!(zv->zv_flags & ZVOL_WCE) ||
 			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
 				zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 			/*
 			 * If the caller really wants synchronous writes, and
 			 * can't wait for them, don't return until the write
 			 * is done.
 			 */
 			if (df.df_flags & DF_WAIT_SYNC) {
 				txg_wait_synced(
 				    dmu_objset_pool(zv->zv_objset), 0);
 			}
 		}
 		return (error);
 	}
 
 	default:
 		error = SET_ERROR(ENOTTY);
 		break;
 
 	}
 	/* Common exit for every case that left the switch with "break". */
 	mutex_exit(&zfsdev_state_lock);
 	return (error);
 }
 #endif	/* illumos */
 
 /*
  * Report whether any zvol minors currently exist: 1 when zvol_minors is
  * non-zero, 0 otherwise.
  */
 int
 zvol_busy(void)
 {
 	return (zvol_minors == 0 ? 0 : 1);
 }
 
 /*
  * Initialize zvol global state: the zfsdev soft-state table (a failure
  * here trips VERIFY) and, on illumos, zfsdev_state_lock.  Other builds
  * log an initialization message instead of creating the mutex here.
  */
 void
 zvol_init(void)
 {
 	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
 	    1) == 0);
 #ifdef illumos
 	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
 #else
 	ZFS_LOG(1, "ZVOL Initialized.");
 #endif
 }
 
 /*
  * Tear down what zvol_init() set up: destroy zfsdev_state_lock (illumos
  * only) and release the zfsdev soft-state table.  The deinitialization
  * message is logged in every build.
  */
 void
 zvol_fini(void)
 {
 #ifdef illumos
 	mutex_destroy(&zfsdev_state_lock);
 #endif
 	ddi_soft_state_fini(&zfsdev_state);
 	ZFS_LOG(1, "ZVOL Deinitialized.");
 }
 
 #ifdef illumos
 /*
  * Check half of the sync task issued from zvol_dump_init(): returns 1
  * when MULTI_VDEV_CRASH_DUMP is already active on the tx's pool, and 0
  * otherwise.  arg is unused.
  */
 /*ARGSUSED*/
 static int
 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
 {
 	spa_t *pool = dmu_tx_pool(tx)->dp_spa;
 
 	return (spa_feature_is_active(pool,
 	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? 1 : 0);
 }
 
 /*
  * Sync half of the sync task issued from zvol_dump_init(): bumps the
  * MULTI_VDEV_CRASH_DUMP feature count on the tx's pool.  arg is unused.
  */
 /*ARGSUSED*/
 static void
 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_feature_incr(dmu_tx_pool(tx)->dp_spa,
 	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
 }
 
 /*
  * Prepare a zvol for use as a dump device.
  *
  * Frees the volume's existing blocks, activates MULTI_VDEV_CRASH_DUMP
  * when the pool layout calls for it, saves the properties that
  * dumpification overrides into the zvol's ZAP (or, when "resize" is
  * set, only refreshes the saved refreservation), applies the dump
  * property settings, and finally preallocates the space.
  *
  * Caller must hold zfsdev_state_lock.  Returns 0 or an errno value.
  */
 static int
 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 {
 	dmu_tx_t *tx;
 	int error;
 	objset_t *os = zv->zv_objset;
 	spa_t *spa = dmu_objset_spa(os);
 	vdev_t *vd = spa->spa_root_vdev;
 	nvlist_t *nv = NULL;
 	uint64_t version = spa_version(spa);
 	uint64_t checksum, compress, refresrv, vbs, dedup;
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 	ASSERT(vd->vdev_ops == &vdev_root_ops);
 
 	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
 	    DMU_OBJECT_END);
 	if (error != 0)
 		return (error);
 	/* wait for dmu_free_long_range to actually free the blocks */
 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
 	/*
 	 * If the pool on which the dump device is being initialized has more
 	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
 	 * enabled.  If so, bump that feature's counter to indicate that the
 	 * feature is active. We also check the vdev type to handle the
 	 * following case:
 	 *   # zpool create test raidz disk1 disk2 disk3
 	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
 	 *   the raidz vdev itself has 3 children.
 	 *
 	 * NOTE(review): vd is asserted above to be the root vdev, so the
 	 * comparison of vd->vdev_ops against vdev_raidz_ops looks like it
 	 * cannot match as written -- confirm whether the child vdev was
 	 * meant.
 	 */
 	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
 		if (!spa_feature_is_enabled(spa,
 		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
 			return (SET_ERROR(ENOTSUP));
 		/* The sync task's result is deliberately ignored. */
 		(void) dsl_sync_task(spa_name(spa),
 		    zfs_mvdev_dump_feature_check,
 		    zfs_mvdev_dump_activate_feature_sync, NULL,
 		    2, ZFS_SPACE_CHECK_RESERVED);
 	}
 
 	/*
 	 * First-time initialization: read the current property values so
 	 * they can be saved below.  Each lookup only runs if the previous
 	 * one succeeded.
 	 */
 	if (!resize) {
 		error = dsl_prop_get_integer(zv->zv_name,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
 		if (error == 0) {
 			error = dsl_prop_get_integer(zv->zv_name,
 			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
 			    NULL);
 		}
 		if (error == 0) {
 			error = dsl_prop_get_integer(zv->zv_name,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 			    &refresrv, NULL);
 		}
 		if (error == 0) {
 			error = dsl_prop_get_integer(zv->zv_name,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
 			    NULL);
 		}
 		if (version >= SPA_VERSION_DEDUP && error == 0) {
 			error = dsl_prop_get_integer(zv->zv_name,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
 		}
 	}
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	/*
 	 * If we are resizing the dump device then we only need to
 	 * update the refreservation to match the newly updated
 	 * zvolsize. Otherwise, we save off the original state of the
 	 * zvol so that we can restore them if the zvol is ever undumpified.
 	 */
 	if (resize) {
 		error = zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 		    &zv->zv_volsize, tx);
 	} else {
 		error = zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
 		    &compress, tx);
 		if (error == 0) {
 			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
 			    &checksum, tx);
 		}
 		if (error == 0) {
 			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 			    &refresrv, tx);
 		}
 		if (error == 0) {
 			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
 			    &vbs, tx);
 		}
 		if (error == 0) {
 			error = dmu_object_set_blocksize(
 			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
 		}
 		if (version >= SPA_VERSION_DEDUP && error == 0) {
 			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
 			    &dedup, tx);
 		}
 		/* Mirror the new block size in the in-core state. */
 		if (error == 0)
 			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
 	}
 	/* The tx is committed whether or not the updates above succeeded. */
 	dmu_tx_commit(tx);
 
 	/*
 	 * We only need update the zvol's property if we are initializing
 	 * the dump area for the first time.
 	 */
 	if (error == 0 && !resize) {
 		/*
 		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
 		 * function.  Otherwise, use the old default -- OFF.
 		 */
 		checksum = spa_feature_is_active(spa,
 		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
 		    ZIO_CHECKSUM_OFF;
 
 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 		    ZIO_COMPRESS_OFF) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 		    checksum) == 0);
 		if (version >= SPA_VERSION_DEDUP) {
 			VERIFY(nvlist_add_uint64(nv,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP),
 			    ZIO_CHECKSUM_OFF) == 0);
 		}
 
 		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
 		    nv, NULL);
 		nvlist_free(nv);
 	}
 
 	/* Allocate the space for the dump */
 	if (error == 0)
 		error = zvol_prealloc(zv);
 	return (error);
 }
 
 /*
  * Turn a zvol into a dump device.
  *
  * Runs zvol_dump_init() when no dump size is recorded in the ZAP or the
  * recorded size differs from the current volume size, builds the LBA
  * mapping, sets ZVOL_DUMPIFIED, and records the volume size under
  * ZVOL_DUMPSIZE.  Every failure after the read-only check is followed
  * by a zvol_dump_fini() call whose own result is ignored.
  *
  * Returns EROFS for a read-only zvol, otherwise 0 or an errno value.
  */
 static int
 zvol_dumpify(zvol_state_t *zv)
 {
 	int error = 0;
 	uint64_t dumpsize = 0;
 	dmu_tx_t *tx;
 	objset_t *os = zv->zv_objset;
 
 	if (zv->zv_flags & ZVOL_RDONLY)
 		return (SET_ERROR(EROFS));
 
 	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
 	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
 		/* A non-zero recorded size means this is only a resize. */
 		boolean_t resize = (dumpsize > 0);
 
 		if ((error = zvol_dump_init(zv, resize)) != 0) {
 			(void) zvol_dump_fini(zv);
 			return (error);
 		}
 	}
 
 	/*
 	 * Build up our lba mapping.
 	 */
 	error = zvol_get_lbas(zv);
 	if (error) {
 		(void) zvol_dump_fini(zv);
 		return (error);
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		(void) zvol_dump_fini(zv);
 		return (error);
 	}
 
 	/* The flag is set before the size is recorded in the ZAP. */
 	zv->zv_flags |= ZVOL_DUMPIFIED;
 	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
 	    &zv->zv_volsize, tx);
 	dmu_tx_commit(tx);
 
 	if (error) {
 		(void) zvol_dump_fini(zv);
 		return (error);
 	}
 
 	/* Do not return success until the update has been synced. */
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	return (0);
 }
 
 /*
  * Undo dumpification of a zvol.
  *
  * Removes the ZVOL_DUMPSIZE entry, re-applies the property values saved
  * in the zvol's ZAP, frees the LBA extents and the volume's blocks,
  * clears ZVOL_DUMPIFIED, and restores the saved volume block size.
  *
  * Returns an errno value only if a transaction cannot be assigned;
  * failures of the individual restore steps are ignored.
  */
 static int
 zvol_dump_fini(zvol_state_t *zv)
 {
 	dmu_tx_t *tx;
 	objset_t *os = zv->zv_objset;
 	nvlist_t *nv;
 	int error = 0;
 	uint64_t checksum, compress, refresrv, vbs, dedup;
 	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
 
 	/*
 	 * Attempt to restore the zvol back to its pre-dumpified state.
 	 * This is a best-effort attempt as it's possible that not all
 	 * of these properties were initialized during the dumpify process
 	 * (i.e. error during zvol_dump_init).
 	 */
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
 	dmu_tx_commit(tx);
 
 	/*
 	 * NOTE(review): the results of these lookups are ignored, and
 	 * checksum/compress/refresrv/vbs have no initializer, so a failed
 	 * lookup appears to leave an indeterminate value that is still
 	 * added to the nvlist (and, for vbs, passed to
 	 * dmu_object_set_blocksize()) below -- confirm this is acceptable
 	 * for the best-effort path.
 	 */
 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
 
 	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	(void) nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
 	(void) nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
 	(void) nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
 	/* Unlike the others, dedup is only restored if its lookup works. */
 	if (version >= SPA_VERSION_DEDUP &&
 	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
 		(void) nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
 	}
 	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
 	    nv, NULL);
 	nvlist_free(nv);
 
 	zvol_free_extents(zv);
 	zv->zv_flags &= ~ZVOL_DUMPIFIED;
 	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
 	/* wait for dmu_free_long_range to actually free the blocks */
 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 	/* Update the in-core block size only if the object accepted it. */
 	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
 		zv->zv_volblocksize = vbs;
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 #else	/* !illumos */
 
 /*
  * Bring a GEOM-backed zvol into service: reset the provider's error to
  * 0 and start the zvol_geom_worker kernel thread for it under the
  * "zfskern" process.
  */
 static void
 zvol_geom_run(zvol_state_t *zv)
 {
 	struct g_provider *provider = zv->zv_provider;
 	const char *volname;
 
 	g_error_provider(provider, 0);
 
 	/*
 	 * The thread is named after the provider name with its first
 	 * sizeof(ZVOL_DRIVER) bytes skipped (presumably the "ZVOL_DRIVER/"
 	 * prefix, given providers are named "%s/%s" elsewhere in this file).
 	 */
 	volname = provider->name + sizeof(ZVOL_DRIVER);
 	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
 	    "zfskern", "zvol %s", volname);
 }
 
 /*
  * Shut down the GEOM side of a zvol.
  *
  * Asks the worker thread to exit by setting zv_state to 1 and waits for
  * it to acknowledge by setting zv_state to 2 (see zvol_geom_worker()),
  * then destroys the queue mutex, detaches the provider from the zvol,
  * and withers the geom with ENXIO.
  *
  * Must be called with the GEOM topology lock held.
  */
 static void
 zvol_geom_destroy(zvol_state_t *zv)
 {
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	mtx_lock(&zv->zv_queue_mtx);
 	zv->zv_state = 1;
 	wakeup_one(&zv->zv_queue);
 	/* msleep() drops and re-takes zv_queue_mtx around each wait. */
 	while (zv->zv_state != 2)
 		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
 	/* The mutex is still held by this thread when it is destroyed. */
 	mtx_destroy(&zv->zv_queue_mtx);
 
 	pp = zv->zv_provider;
 	zv->zv_provider = NULL;
 	pp->private = NULL;
 	g_wither_geom(pp->geom, ENXIO);
 }
 
 /*
  * GEOM access method for zvol providers.
  *
  * acr/acw/ace are the requested changes in read, write and exclusive
  * access counts.  A net positive change is forwarded to zvol_open() and
  * a net negative change to zvol_close(), with the topology lock dropped
  * around the call.  Returns 0 or an errno value.
  */
 static int
 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	int count, error, flags;
 
 	g_topology_assert();
 
 	/*
 	 * To make it easier we expect either open or close, but not both
 	 * at the same time.
 	 */
 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
 	    (acr <= 0 && acw <= 0 && ace <= 0),
 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
 	    pp->name, acr, acw, ace));
 
 	/*
 	 * No zvol behind the provider any more: closes succeed trivially,
 	 * opens fail with the provider's recorded error.
 	 */
 	if (pp->private == NULL) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
 		return (pp->error);
 	}
 
 	/*
 	 * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
 	 * because GEOM already handles that and handles it a bit differently.
 	 * GEOM allows for multiple read/exclusive consumers and ZFS allows
 	 * only one exclusive consumer, no matter if it is reader or writer.
 	 * I like better the way GEOM works so I'll leave it for GEOM to
 	 * decide what to do.
 	 */
 
 	count = acr + acw + ace;
 	if (count == 0)
 		return (0);
 
 	/* Exclusive access is mapped onto FREAD. */
 	flags = 0;
 	if (acr != 0 || ace != 0)
 		flags |= FREAD;
 	if (acw != 0)
 		flags |= FWRITE;
 
 	g_topology_unlock();
 	if (count > 0)
 		error = zvol_open(pp, flags, count);
 	else
 		error = zvol_close(pp, flags, -count);
 	g_topology_lock();
 	return (error);
 }
 
 /*
  * GEOM start method for zvol providers.
  *
  * Flush, read, write and delete requests are handled inline when the
  * calling thread may sleep; otherwise they are appended to the zvol's
  * queue for zvol_geom_worker().  BIO_GETATTR answers a fixed set of
  * attributes; anything unrecognized is completed with EOPNOTSUPP.
  */
 static void
 zvol_geom_start(struct bio *bp)
 {
 	zvol_state_t *zv;
 	boolean_t first;
 
 	zv = bp->bio_to->private;
 	ASSERT(zv != NULL);
 	switch (bp->bio_cmd) {
 	case BIO_FLUSH:
 		if (!THREAD_CAN_SLEEP())
 			goto enqueue;
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		g_io_deliver(bp, 0);
 		break;
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		if (!THREAD_CAN_SLEEP())
 			goto enqueue;
 		zvol_strategy(bp);
 		break;
 	case BIO_GETATTR: {
 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
 		uint64_t refd, avail, usedobjs, availobjs, val;
 
 		/*
 		 * Each g_handleattr_*() call that returns non-zero leads
 		 * straight to return; an attribute that is not handled
 		 * falls through to the EOPNOTSUPP default below.
 		 */
 		if (g_handleattr_int(bp, "GEOM::candelete", 1))
 			return;
 		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
 			dmu_objset_space(zv->zv_objset, &refd, &avail,
 			    &usedobjs, &availobjs);
 			if (g_handleattr_off_t(bp, "blocksavail",
 			    avail / DEV_BSIZE))
 				return;
 		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
 			dmu_objset_space(zv->zv_objset, &refd, &avail,
 			    &usedobjs, &availobjs);
 			if (g_handleattr_off_t(bp, "blocksused",
 			    refd / DEV_BSIZE))
 				return;
 		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
 			/* Pool space minus pool allocation, in DEV_BSIZE units. */
 			avail = metaslab_class_get_space(spa_normal_class(spa));
 			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
 			if (g_handleattr_off_t(bp, "poolblocksavail",
 			    avail / DEV_BSIZE))
 				return;
 		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
 			if (g_handleattr_off_t(bp, "poolblocksused",
 			    refd / DEV_BSIZE))
 				return;
 		}
 		/* FALLTHROUGH */
 	}
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		break;
 	}
 	return;
 
 enqueue:
 	/* Only wake the worker when the queue was empty before this bio. */
 	mtx_lock(&zv->zv_queue_mtx);
 	first = (bioq_first(&zv->zv_queue) == NULL);
 	bioq_insert_tail(&zv->zv_queue, bp);
 	mtx_unlock(&zv->zv_queue_mtx);
 	if (first)
 		wakeup_one(&zv->zv_queue);
 }
 
 /*
  * Per-zvol worker thread (started by zvol_geom_run()).
  *
  * Runs at PRIBIO priority and services bios that zvol_geom_start()
  * queued.  When the queue is empty and zv_state is 1 it sets zv_state to
  * 2, wakes the waiter and exits; otherwise it sleeps until more work is
  * queued.  arg is the zvol_state_t.
  */
 static void
 zvol_geom_worker(void *arg)
 {
 	zvol_state_t *zv;
 	struct bio *bp;
 
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	zv = arg;
 	for (;;) {
 		mtx_lock(&zv->zv_queue_mtx);
 		bp = bioq_takefirst(&zv->zv_queue);
 		if (bp == NULL) {
 			/* Shutdown requested: acknowledge and exit. */
 			if (zv->zv_state == 1) {
 				zv->zv_state = 2;
 				wakeup(&zv->zv_state);
 				mtx_unlock(&zv->zv_queue_mtx);
 				kthread_exit();
 			}
 			/*
 			 * PDROP: the mutex is not re-taken on wakeup, which
 			 * is why the loop starts with mtx_lock() again.
 			 */
 			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
 			    "zvol:io", 0);
 			continue;
 		}
 		mtx_unlock(&zv->zv_queue_mtx);
 		/* Same dispatch as the inline path in zvol_geom_start(). */
 		switch (bp->bio_cmd) {
 		case BIO_FLUSH:
 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 			g_io_deliver(bp, 0);
 			break;
 		case BIO_READ:
 		case BIO_WRITE:
 		case BIO_DELETE:
 			zvol_strategy(bp);
 			break;
 		default:
 			g_io_deliver(bp, EOPNOTSUPP);
 			break;
 		}
 	}
 }
 
 extern boolean_t dataset_name_hidden(const char *name);
 
 /*
  * Create a zvol minor for every snapshot of the volume "name".
  *
  * os is the volume's objset; it is borrowed from the caller, which keeps
  * its holds on it and releases them after this function returns.  This
  * function therefore never releases os itself.
  *
  * Returns 0 once the snapshot list is exhausted, ENAMETOOLONG if
  * "name@" does not fit in MAXPATHLEN, or the first error returned by
  * dmu_snapshot_list_next() or zvol_create_minor().
  */
 static int
 zvol_create_snapshots(objset_t *os, const char *name)
 {
 	uint64_t cookie, obj;
 	char *sname;
 	int error, len;
 
 	cookie = obj = 0;
 	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 #if 0
 	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
 	    DS_FIND_SNAPSHOTS);
 #endif
 
 	for (;;) {
 		len = snprintf(sname, MAXPATHLEN, "%s@", name);
 		if (len >= MAXPATHLEN) {
 			/*
 			 * Do not release os here: the hold belongs to the
 			 * caller, which still uses and releases it after we
 			 * return.
 			 */
 			error = ENAMETOOLONG;
 			break;
 		}
 
 		/* The snapshot name is written right after the "name@" prefix. */
 		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
 		    sname + len, &obj, &cookie, NULL);
 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		if (error != 0) {
 			/* ENOENT marks the end of the list, not a failure. */
 			if (error == ENOENT)
 				error = 0;
 			break;
 		}
 
 		if ((error = zvol_create_minor(sname)) != 0) {
 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
 			    sname, error);
 			break;
 		}
 	}
 
 	kmem_free(sname, MAXPATHLEN);
 	return (error);
 }
 
 /*
  * Create zvol minors for the dataset "name" and everything beneath it.
  *
  * - Hidden dataset names are skipped.
  * - For a ZVOL objset, a minor is created for the volume itself and then
  *   for each of its snapshots.
  * - For a ZFS objset, the function recurses into every child dataset;
  *   errors from the recursive calls are ignored.
  * - Any other objset type is ignored.
  *
  * Returns 0 on success or an errno value.
  */
 int
 zvol_create_minors(const char *name)
 {
 	uint64_t cookie;
 	objset_t *os;
 	char *osname, *p;
 	int error, len;
 
 	if (dataset_name_hidden(name))
 		return (0);
 
 	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
 		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
 		    name, error);
 		return (error);
 	}
 	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
 		/*
 		 * Swap the pool hold for a long hold on the dataset while
 		 * the minors are created; the dataset hold itself is
 		 * released at the end of this branch.
 		 */
 		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
 		dsl_pool_rele(dmu_objset_pool(os), FTAG);
 		error = zvol_create_minor(name);
 		if (error == 0 || error == EEXIST) {
 			error = zvol_create_snapshots(os, name);
 		} else {
 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
 			    name, error);
 		}
 		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
 		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
 		return (error);
 	}
 	if (dmu_objset_type(os) != DMU_OST_ZFS) {
 		dmu_objset_rele(os, FTAG);
 		return (0);
 	}
 
 	/* Build "name/" once; child names are written at p on each pass. */
 	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
 		dmu_objset_rele(os, FTAG);
 		kmem_free(osname, MAXPATHLEN);
 		return (ENOENT);
 	}
 	p = osname + strlen(osname);
 	len = MAXPATHLEN - (p - osname);
 
 #if 0
 	/* Prefetch the datasets. */
 	cookie = 0;
 	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
 		if (!dataset_name_hidden(osname))
 			(void) dmu_objset_prefetch(osname, NULL);
 	}
 #endif
 
 	cookie = 0;
 	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
 	    &cookie) == 0) {
 		/* The hold is dropped across the recursion and re-taken after. */
 		dmu_objset_rele(os, FTAG);
 		(void)zvol_create_minors(osname);
 		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
 			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
 			    name, error);
 			/* No hold to release here, but the buffer is ours. */
 			kmem_free(osname, MAXPATHLEN);
 			return (error);
 		}
 	}
 
 	dmu_objset_rele(os, FTAG);
 	kmem_free(osname, MAXPATHLEN);
 	return (0);
 }
 
 /*
  * Rename a single zvol minor to "newname".
  *
  * In GEOM mode the old provider is withered and a new one named
  * "ZVOL_DRIVER/newname" is created on the same geom.  In DEV mode the
  * old device node is destroyed and a new one is made.  In both cases
  * zv_name is updated afterwards.
  *
  * Caller must hold zfsdev_state_lock.
  */
 static void
 zvol_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct cdev *dev;
 
 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		g_topology_lock();
 		pp = zv->zv_provider;
 		ASSERT(pp != NULL);
 		gp = pp->geom;
 		ASSERT(gp != NULL);
 
 		zv->zv_provider = NULL;
 		g_wither_provider(pp, ENXIO);
 
 		/* The replacement provider is set up like the original. */
 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
 		pp->sectorsize = DEV_BSIZE;
 		pp->mediasize = zv->zv_volsize;
 		pp->private = zv;
 		zv->zv_provider = pp;
 		g_error_provider(pp, 0);
 		g_topology_unlock();
 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 		dev = zv->zv_dev;
 		ASSERT(dev != NULL);
 		zv->zv_dev = NULL;
 		destroy_dev(dev);
 
 		/*
 		 * If the new node cannot be created, zv_dev stays NULL and
 		 * the failure is not reported to the caller.
 		 */
 		if (make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
 		    &dev, &zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR,
 		    0640, "%s/%s", ZVOL_DRIVER, newname) == 0) {
 			zv->zv_dev = dev;
 			dev->si_iosize_max = MAXPHYS;
 			dev->si_drv2 = zv;
 		}
 	}
 	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
 }
 
 /*
  * Rename every zvol minor affected by renaming dataset "oldname" to
  * "newname".
  *
  * A minor whose name equals oldname is renamed to newname.  A minor
  * whose name is oldname followed by '/' or '@' (a descendant or a
  * snapshot) has the oldname prefix replaced with newname.
  *
  * zfsdev_state_lock is taken here unless the caller already holds it.
  */
 void
 zvol_rename_minors(const char *oldname, const char *newname)
 {
 	char name[MAXPATHLEN];
 	size_t oldnamelen;
 	zvol_state_t *zv;
 	boolean_t locked = B_FALSE;
 
 	oldnamelen = strlen(oldname);
 
 	DROP_GIANT();
 	/* See comment in zvol_open(). */
 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
 		mutex_enter(&zfsdev_state_lock);
 		locked = B_TRUE;
 	}
 
 	LIST_FOREACH(zv, &all_zvols, zv_links) {
 		if (strcmp(zv->zv_name, oldname) == 0) {
 			zvol_rename_minor(zv, newname);
 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
 		    (zv->zv_name[oldnamelen] == '/' ||
 		     zv->zv_name[oldnamelen] == '@')) {
 			/* newname + the original separator + the original tail. */
 			snprintf(name, sizeof(name), "%s%c%s", newname,
 			    zv->zv_name[oldnamelen],
 			    zv->zv_name + oldnamelen + 1);
 			zvol_rename_minor(zv, name);
 		}
 	}
 
 	/* Only drop the lock if this call was the one that took it. */
 	if (locked)
 		mutex_exit(&zfsdev_state_lock);
 	PICKUP_GIANT();
 }
 
 /*
  * Character-device open entry point for a zvol.
  *
  * All work is done under zfsdev_state_lock.  Returns ENXIO when the cdev
  * has no zvol attached yet, the error from zvol_first_open() when the first
  * open fails, EROFS for a write open of a read-only volume, and EBUSY when
  * the volume is held exclusively (or an exclusive open is requested while
  * other opens exist).  On success zv_total_opens is incremented.
  */
 static int
 zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	zvol_state_t *zv;
 	int err = 0;
 
 	mutex_enter(&zfsdev_state_lock);
 
 	zv = dev->si_drv2;
 	if (zv == NULL) {
 		/* zvol_create_minor() not done yet */
 		err = ENXIO;
 		goto unlock;
 	}
 
 	if (zv->zv_total_opens == 0) {
 		err = zvol_first_open(zv);
 		if (err != 0)
 			goto unlock;
 	}
 
 	if ((flags & FWRITE) != 0 && (zv->zv_flags & ZVOL_RDONLY) != 0) {
 		err = SET_ERROR(EROFS);
 		goto fail;
 	}
 	if ((zv->zv_flags & ZVOL_EXCL) != 0) {
 		err = SET_ERROR(EBUSY);
 		goto fail;
 	}
 #ifdef FEXCL
 	if ((flags & FEXCL) != 0) {
 		if (zv->zv_total_opens != 0) {
 			err = SET_ERROR(EBUSY);
 			goto fail;
 		}
 		zv->zv_flags |= ZVOL_EXCL;
 	}
 #endif
 
 	/* Every check passed: account for this open. */
 	zv->zv_total_opens++;
 	goto unlock;
 
 fail:
 	/* Nobody else has the volume open, so tear the first open down. */
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
 unlock:
 	mutex_exit(&zfsdev_state_lock);
 	return (err);
 }
 
 /*
  * Character-device close entry point for a zvol.
  *
  * Under zfsdev_state_lock: returns ENXIO when the cdev has no zvol
  * attached; otherwise clears ZVOL_EXCL if set, drops one open reference and
  * calls zvol_last_close() when the count reaches zero, then returns 0.
  */
 static int
 zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	zvol_state_t *zv;
 	int rc = ENXIO;
 
 	mutex_enter(&zfsdev_state_lock);
 
 	zv = dev->si_drv2;
 	if (zv != NULL) {
 		if ((zv->zv_flags & ZVOL_EXCL) != 0) {
 			ASSERT(zv->zv_total_opens == 1);
 			zv->zv_flags &= ~ZVOL_EXCL;
 		}
 
 		/*
 		 * A close with a zero open count is spurious and points at
 		 * a bug in the kernel / DDI framework.
 		 */
 		ASSERT(zv->zv_total_opens != 0);
 
 		/* Multiple opens may be answered by a single close. */
 		if (--zv->zv_total_opens == 0)
 			zvol_last_close(zv);
 
 		rc = 0;
 	}
 
 	mutex_exit(&zfsdev_state_lock);
 	return (rc);
 }
 
 /*
  * Character-device ioctl entry point for a zvol.
  *
  * Handles the disk ioctls DIOCGSECTORSIZE, DIOCGMEDIASIZE, DIOCGFLUSH,
  * DIOCGDELETE, DIOCGSTRIPESIZE, DIOCGSTRIPEOFFSET and DIOCGATTR, plus
  * FIOSEEKHOLE / FIOSEEKDATA.  Results are written through 'data'.
  *
  * Returns 0 on success, EINVAL for a malformed DIOCGDELETE range, the error
  * reported by the DMU for delete/seek requests, and ENOIOCTL for an unknown
  * command or unknown DIOCGATTR attribute name.
  */
 static int
 zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
 {
 	zvol_state_t *zv;
 	int error;
 
 	zv = dev->si_drv2;
 
 	error = 0;
 	KASSERT(zv->zv_total_opens > 0,
 	    ("Device with zero access count in zvol_d_ioctl"));
 
 	switch (cmd) {
 	case DIOCGSECTORSIZE:
 		*(u_int *)data = DEV_BSIZE;
 		break;
 	case DIOCGMEDIASIZE:
 		*(off_t *)data = zv->zv_volsize;
 		break;
 	case DIOCGFLUSH:
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		break;
 	case DIOCGDELETE: {
 		rl_t *rl;
 		off_t offset, length;
 		dmu_tx_t *tx;
 
 		/* With unmap disabled the request is accepted and ignored. */
 		if (!zvol_unmap_enabled)
 			break;
 
 		/* 'data' carries two off_t values: offset, then length. */
 		offset = ((off_t *)data)[0];
 		length = ((off_t *)data)[1];
 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
 		    offset < 0 || offset >= zv->zv_volsize ||
 		    length <= 0) {
 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
 			    length);
 			error = EINVAL;
 			break;
 		}
 
 		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
 		tx = dmu_tx_create(zv->zv_objset);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			/* Log the truncate first, then free the range. */
 			zvol_log_truncate(zv, tx, offset, length, B_TRUE);
 			dmu_tx_commit(tx);
 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 			    offset, length);
 		}
 		zfs_range_unlock(rl);
 		if (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		break;
 	}
 	case DIOCGSTRIPESIZE:
 		*(off_t *)data = zv->zv_volblocksize;
 		break;
 	case DIOCGSTRIPEOFFSET:
 		*(off_t *)data = 0;
 		break;
 	case DIOCGATTR: {
 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
 		uint64_t refd, avail, usedobjs, availobjs;
 
 		/* Space figures are reported in DEV_BSIZE units. */
 		if (strcmp(arg->name, "GEOM::candelete") == 0)
 			arg->value.i = 1;
 		else if (strcmp(arg->name, "blocksavail") == 0) {
 			dmu_objset_space(zv->zv_objset, &refd, &avail,
 			    &usedobjs, &availobjs);
 			arg->value.off = avail / DEV_BSIZE;
 		} else if (strcmp(arg->name, "blocksused") == 0) {
 			dmu_objset_space(zv->zv_objset, &refd, &avail,
 			    &usedobjs, &availobjs);
 			arg->value.off = refd / DEV_BSIZE;
 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
 			avail = metaslab_class_get_space(spa_normal_class(spa));
 			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
 			arg->value.off = avail / DEV_BSIZE;
 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
 			arg->value.off = refd / DEV_BSIZE;
 		} else
 			error = ENOIOCTL;
 		break;
 	}
 	case FIOSEEKHOLE:
 	case FIOSEEKDATA: {
 		off_t *off = (off_t *)data;
 		uint64_t noff;
 		boolean_t hole;
 
 		/* In/out: *off is the start offset and receives the result. */
 		hole = (cmd == FIOSEEKHOLE);
 		noff = *off;
 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
 		*off = noff;
 		break;
 	}
 	default:
 		error = ENOIOCTL;
 		break;
 	}
 
 	return (error);
 }
 #endif	/* illumos */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	(revision 296519)
@@ -1,968 +1,969 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_FS_ZFS_H
 #define	_SYS_FS_ZFS_H
 
 #include <sys/types.h>
 #include <sys/ioccom.h>
 #include <sys/time.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Types and constants shared between userland and the kernel.
  */
 
 /*
  * Each dataset can be one of the following types.  These constants can be
  * combined into masks that can be passed to various functions.
  */
 typedef enum {
 	ZFS_TYPE_FILESYSTEM	= (1 << 0),
 	ZFS_TYPE_SNAPSHOT	= (1 << 1),
 	ZFS_TYPE_VOLUME		= (1 << 2),
 	ZFS_TYPE_POOL		= (1 << 3),
 	ZFS_TYPE_BOOKMARK	= (1 << 4)
 } zfs_type_t;
 
 typedef enum dmu_objset_type {
 	DMU_OST_NONE,
 	DMU_OST_META,
 	DMU_OST_ZFS,
 	DMU_OST_ZVOL,
 	DMU_OST_OTHER,			/* For testing only! */
 	DMU_OST_ANY,			/* Be careful! */
 	DMU_OST_NUMTYPES
 } dmu_objset_type_t;
 
 #define	ZFS_TYPE_DATASET	\
 	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
 
 #define	ZAP_MAXNAMELEN 256
 #define	ZAP_MAXVALUELEN (1024 * 8)
 #define	ZAP_OLDMAXVALUELEN 1024
 
 /*
  * Dataset properties are identified by these constants and must be added to
  * the end of this list to ensure that external consumers are not affected
  * by the change. If you make any changes to this list, be sure to update
  * the property table in usr/src/common/zfs/zfs_prop.c.
  */
 typedef enum {
 	ZFS_PROP_TYPE,
 	ZFS_PROP_CREATION,
 	ZFS_PROP_USED,
 	ZFS_PROP_AVAILABLE,
 	ZFS_PROP_REFERENCED,
 	ZFS_PROP_COMPRESSRATIO,
 	ZFS_PROP_MOUNTED,
 	ZFS_PROP_ORIGIN,
 	ZFS_PROP_QUOTA,
 	ZFS_PROP_RESERVATION,
 	ZFS_PROP_VOLSIZE,
 	ZFS_PROP_VOLBLOCKSIZE,
 	ZFS_PROP_RECORDSIZE,
 	ZFS_PROP_MOUNTPOINT,
 	ZFS_PROP_SHARENFS,
 	ZFS_PROP_CHECKSUM,
 	ZFS_PROP_COMPRESSION,
 	ZFS_PROP_ATIME,
 	ZFS_PROP_DEVICES,
 	ZFS_PROP_EXEC,
 	ZFS_PROP_SETUID,
 	ZFS_PROP_READONLY,
 	ZFS_PROP_ZONED,
 	ZFS_PROP_SNAPDIR,
 	ZFS_PROP_ACLMODE,
 	ZFS_PROP_ACLINHERIT,
 	ZFS_PROP_CREATETXG,		/* not exposed to the user */
 	ZFS_PROP_NAME,			/* not exposed to the user */
 	ZFS_PROP_CANMOUNT,
 	ZFS_PROP_ISCSIOPTIONS,		/* not exposed to the user */
 	ZFS_PROP_XATTR,
 	ZFS_PROP_NUMCLONES,		/* not exposed to the user */
 	ZFS_PROP_COPIES,
 	ZFS_PROP_VERSION,
 	ZFS_PROP_UTF8ONLY,
 	ZFS_PROP_NORMALIZE,
 	ZFS_PROP_CASE,
 	ZFS_PROP_VSCAN,
 	ZFS_PROP_NBMAND,
 	ZFS_PROP_SHARESMB,
 	ZFS_PROP_REFQUOTA,
 	ZFS_PROP_REFRESERVATION,
 	ZFS_PROP_GUID,
 	ZFS_PROP_PRIMARYCACHE,
 	ZFS_PROP_SECONDARYCACHE,
 	ZFS_PROP_USEDSNAP,
 	ZFS_PROP_USEDDS,
 	ZFS_PROP_USEDCHILD,
 	ZFS_PROP_USEDREFRESERV,
 	ZFS_PROP_USERACCOUNTING,	/* not exposed to the user */
 	ZFS_PROP_STMF_SHAREINFO,	/* not exposed to the user */
 	ZFS_PROP_DEFER_DESTROY,
 	ZFS_PROP_USERREFS,
 	ZFS_PROP_LOGBIAS,
 	ZFS_PROP_UNIQUE,		/* not exposed to the user */
 	ZFS_PROP_OBJSETID,		/* not exposed to the user */
 	ZFS_PROP_DEDUP,
 	ZFS_PROP_MLSLABEL,
 	ZFS_PROP_SYNC,
 	ZFS_PROP_REFRATIO,
 	ZFS_PROP_WRITTEN,
 	ZFS_PROP_CLONES,
 	ZFS_PROP_LOGICALUSED,
 	ZFS_PROP_LOGICALREFERENCED,
 	ZFS_PROP_INCONSISTENT,		/* not exposed to the user */
 	ZFS_PROP_VOLMODE,
 	ZFS_PROP_FILESYSTEM_LIMIT,
 	ZFS_PROP_SNAPSHOT_LIMIT,
 	ZFS_PROP_FILESYSTEM_COUNT,
 	ZFS_PROP_SNAPSHOT_COUNT,
 	ZFS_PROP_REDUNDANT_METADATA,
 	ZFS_PROP_PREV_SNAP,
 	ZFS_PROP_RECEIVE_RESUME_TOKEN,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
 typedef enum {
 	ZFS_PROP_USERUSED,
 	ZFS_PROP_USERQUOTA,
 	ZFS_PROP_GROUPUSED,
 	ZFS_PROP_GROUPQUOTA,
 	ZFS_NUM_USERQUOTA_PROPS
 } zfs_userquota_prop_t;
 
 extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
 
 /*
  * Pool properties are identified by these constants and must be added to the
  * end of this list to ensure that external consumers are not affected
  * by the change. If you make any changes to this list, be sure to update
  * the property table in usr/src/common/zfs/zpool_prop.c.
  */
 typedef enum {
 	ZPOOL_PROP_NAME,
 	ZPOOL_PROP_SIZE,
 	ZPOOL_PROP_CAPACITY,
 	ZPOOL_PROP_ALTROOT,
 	ZPOOL_PROP_HEALTH,
 	ZPOOL_PROP_GUID,
 	ZPOOL_PROP_VERSION,
 	ZPOOL_PROP_BOOTFS,
 	ZPOOL_PROP_DELEGATION,
 	ZPOOL_PROP_AUTOREPLACE,
 	ZPOOL_PROP_CACHEFILE,
 	ZPOOL_PROP_FAILUREMODE,
 	ZPOOL_PROP_LISTSNAPS,
 	ZPOOL_PROP_AUTOEXPAND,
 	ZPOOL_PROP_DEDUPDITTO,
 	ZPOOL_PROP_DEDUPRATIO,
 	ZPOOL_PROP_FREE,
 	ZPOOL_PROP_ALLOCATED,
 	ZPOOL_PROP_READONLY,
 	ZPOOL_PROP_COMMENT,
 	ZPOOL_PROP_EXPANDSZ,
 	ZPOOL_PROP_FREEING,
 	ZPOOL_PROP_FRAGMENTATION,
 	ZPOOL_PROP_LEAKED,
 	ZPOOL_PROP_MAXBLOCKSIZE,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
 /* Small enough to not hog a whole line of printout in zpool(1M). */
 #define	ZPROP_MAX_COMMENT	32
 
 /*
  * Negative sentinel values.  The replacement lists are parenthesized so
  * each macro expands to a single operand in any expression context.
  * NOTE(review): presumably returned by property lookup/iteration routines
  * in place of a valid property ID -- confirm against callers.
  */
 #define	ZPROP_CONT		(-2)
 #define	ZPROP_INVAL		(-1)
 
 #define	ZPROP_VALUE		"value"
 #define	ZPROP_SOURCE		"source"
 
 /*
  * Property source flags.  Each enumerator occupies its own bit, so values
  * can be OR-ed together into a mask.
  */
 typedef enum {
 	ZPROP_SRC_NONE = 0x1,
 	ZPROP_SRC_DEFAULT = 0x2,
 	ZPROP_SRC_TEMPORARY = 0x4,
 	ZPROP_SRC_LOCAL = 0x8,
 	ZPROP_SRC_INHERITED = 0x10,
 	ZPROP_SRC_RECEIVED = 0x20
 } zprop_source_t;
 
 /* Bitwise OR of all six zprop_source_t values (0x1 through 0x20). */
 #define	ZPROP_SRC_ALL	0x3f
 
 #define	ZPROP_SOURCE_VAL_RECVD	"$recvd"
 #define	ZPROP_N_MORE_ERRORS	"N_MORE_ERRORS"
 /*
  * Dataset flag implemented as a special entry in the props zap object
  * indicating that the dataset has received properties on or after
  * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
  * just as it did in earlier versions, and thereafter, local properties are
  * preserved.
  */
 #define	ZPROP_HAS_RECVD		"$hasrecvd"
 
 typedef enum {
 	ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
 	ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
 } zprop_errflags_t;
 
 typedef int (*zprop_func)(int, void *);
 
 /*
  * Properties to be set on the root file system of a new pool
  * are stuffed into their own nvlist, which is then included in
  * the properties nvlist with the pool properties.
  */
 #define	ZPOOL_ROOTFS_PROPS	"root-props-nvl"
 
 /*
  * Dataset property functions shared between libzfs and kernel.
  */
 const char *zfs_prop_default_string(zfs_prop_t);
 uint64_t zfs_prop_default_numeric(zfs_prop_t);
 boolean_t zfs_prop_readonly(zfs_prop_t);
 boolean_t zfs_prop_inheritable(zfs_prop_t);
 boolean_t zfs_prop_setonce(zfs_prop_t);
 const char *zfs_prop_to_name(zfs_prop_t);
 zfs_prop_t zfs_name_to_prop(const char *);
 boolean_t zfs_prop_user(const char *);
 boolean_t zfs_prop_userquota(const char *);
 int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
 int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
 uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
 boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
 
 /*
  * Pool property functions shared between libzfs and kernel.
  */
 zpool_prop_t zpool_name_to_prop(const char *);
 const char *zpool_prop_to_name(zpool_prop_t);
 const char *zpool_prop_default_string(zpool_prop_t);
 uint64_t zpool_prop_default_numeric(zpool_prop_t);
 boolean_t zpool_prop_readonly(zpool_prop_t);
 boolean_t zpool_prop_feature(const char *);
 boolean_t zpool_prop_unsupported(const char *name);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
 uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
 
 /*
  * Definitions for the Delegation.
  */
 typedef enum {
 	ZFS_DELEG_WHO_UNKNOWN = 0,
 	ZFS_DELEG_USER = 'u',
 	ZFS_DELEG_USER_SETS = 'U',
 	ZFS_DELEG_GROUP = 'g',
 	ZFS_DELEG_GROUP_SETS = 'G',
 	ZFS_DELEG_EVERYONE = 'e',
 	ZFS_DELEG_EVERYONE_SETS = 'E',
 	ZFS_DELEG_CREATE = 'c',
 	ZFS_DELEG_CREATE_SETS = 'C',
 	ZFS_DELEG_NAMED_SET = 's',
 	ZFS_DELEG_NAMED_SET_SETS = 'S'
 } zfs_deleg_who_type_t;
 
 /*
  * Delegation inheritance values.  Numerically,
  * ZFS_DELEG_PERM_LOCALDESCENDENT (3) equals
  * ZFS_DELEG_PERM_LOCAL (1) | ZFS_DELEG_PERM_DESCENDENT (2).
  */
 typedef enum {
 	ZFS_DELEG_NONE = 0,
 	ZFS_DELEG_PERM_LOCAL = 1,
 	ZFS_DELEG_PERM_DESCENDENT = 2,
 	ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
 	ZFS_DELEG_PERM_CREATE = 4
 } zfs_deleg_inherit_t;
 
 #define	ZFS_DELEG_PERM_UID	"uid"
 #define	ZFS_DELEG_PERM_GID	"gid"
 #define	ZFS_DELEG_PERM_GROUPS	"groups"
 
 #define	ZFS_MLSLABEL_DEFAULT	"none"
 
 #define	ZFS_SMB_ACL_SRC		"src"
 #define	ZFS_SMB_ACL_TARGET	"target"
 
 typedef enum {
 	ZFS_CANMOUNT_OFF = 0,
 	ZFS_CANMOUNT_ON = 1,
 	ZFS_CANMOUNT_NOAUTO = 2
 } zfs_canmount_type_t;
 
 typedef enum {
 	ZFS_LOGBIAS_LATENCY = 0,
 	ZFS_LOGBIAS_THROUGHPUT = 1
 } zfs_logbias_op_t;
 
 typedef enum zfs_share_op {
 	ZFS_SHARE_NFS = 0,
 	ZFS_UNSHARE_NFS = 1,
 	ZFS_SHARE_SMB = 2,
 	ZFS_UNSHARE_SMB = 3
 } zfs_share_op_t;
 
 typedef enum zfs_smb_acl_op {
 	ZFS_SMB_ACL_ADD,
 	ZFS_SMB_ACL_REMOVE,
 	ZFS_SMB_ACL_RENAME,
 	ZFS_SMB_ACL_PURGE
 } zfs_smb_acl_op_t;
 
 typedef enum zfs_cache_type {
 	ZFS_CACHE_NONE = 0,
 	ZFS_CACHE_METADATA = 1,
 	ZFS_CACHE_ALL = 2
 } zfs_cache_type_t;
 
 typedef enum {
 	ZFS_SYNC_STANDARD = 0,
 	ZFS_SYNC_ALWAYS = 1,
 	ZFS_SYNC_DISABLED = 2
 } zfs_sync_type_t;
 
 typedef enum {
 	ZFS_VOLMODE_DEFAULT = 0,
 	ZFS_VOLMODE_GEOM = 1,
 	ZFS_VOLMODE_DEV = 2,
 	ZFS_VOLMODE_NONE = 3
 } zfs_volmode_t;
 
 typedef enum {
 	ZFS_REDUNDANT_METADATA_ALL,
 	ZFS_REDUNDANT_METADATA_MOST
 } zfs_redundant_metadata_type_t;
 
 /*
  * On-disk version number.
  */
 #define	SPA_VERSION_1			1ULL
 #define	SPA_VERSION_2			2ULL
 #define	SPA_VERSION_3			3ULL
 #define	SPA_VERSION_4			4ULL
 #define	SPA_VERSION_5			5ULL
 #define	SPA_VERSION_6			6ULL
 #define	SPA_VERSION_7			7ULL
 #define	SPA_VERSION_8			8ULL
 #define	SPA_VERSION_9			9ULL
 #define	SPA_VERSION_10			10ULL
 #define	SPA_VERSION_11			11ULL
 #define	SPA_VERSION_12			12ULL
 #define	SPA_VERSION_13			13ULL
 #define	SPA_VERSION_14			14ULL
 #define	SPA_VERSION_15			15ULL
 #define	SPA_VERSION_16			16ULL
 #define	SPA_VERSION_17			17ULL
 #define	SPA_VERSION_18			18ULL
 #define	SPA_VERSION_19			19ULL
 #define	SPA_VERSION_20			20ULL
 #define	SPA_VERSION_21			21ULL
 #define	SPA_VERSION_22			22ULL
 #define	SPA_VERSION_23			23ULL
 #define	SPA_VERSION_24			24ULL
 #define	SPA_VERSION_25			25ULL
 #define	SPA_VERSION_26			26ULL
 #define	SPA_VERSION_27			27ULL
 #define	SPA_VERSION_28			28ULL
 #define	SPA_VERSION_5000		5000ULL
 
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
 #define	SPA_VERSION			SPA_VERSION_5000
 #define	SPA_VERSION_STRING		"5000"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
  * Used in the code when checking for presence or absence of a feature.
  * Feel free to define multiple symbolic names for each version if there
  * were multiple changes to on-disk structures during that version.
  *
  * NOTE: When checking the current SPA_VERSION in your code, be sure
  *       to use spa_version() since it reports the version of the
  *       last synced uberblock.  Checking the in-flight version can
  *       be dangerous in some cases.
  */
 #define	SPA_VERSION_INITIAL		SPA_VERSION_1
 #define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
 #define	SPA_VERSION_SPARES		SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
 #define	SPA_VERSION_BPOBJ_ACCOUNT	SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
 #define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
 #define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
 #define	SPA_VERSION_GZIP_COMPRESSION	SPA_VERSION_5
 #define	SPA_VERSION_BOOTFS		SPA_VERSION_6
 #define	SPA_VERSION_SLOGS		SPA_VERSION_7
 #define	SPA_VERSION_DELEGATED_PERMS	SPA_VERSION_8
 #define	SPA_VERSION_FUID		SPA_VERSION_9
 #define	SPA_VERSION_REFRESERVATION	SPA_VERSION_9
 #define	SPA_VERSION_REFQUOTA		SPA_VERSION_9
 #define	SPA_VERSION_UNIQUE_ACCURATE	SPA_VERSION_9
 #define	SPA_VERSION_L2CACHE		SPA_VERSION_10
 #define	SPA_VERSION_NEXT_CLONES		SPA_VERSION_11
 #define	SPA_VERSION_ORIGIN		SPA_VERSION_11
 #define	SPA_VERSION_DSL_SCRUB		SPA_VERSION_11
 #define	SPA_VERSION_SNAP_PROPS		SPA_VERSION_12
 #define	SPA_VERSION_USED_BREAKDOWN	SPA_VERSION_13
 #define	SPA_VERSION_PASSTHROUGH_X	SPA_VERSION_14
 #define	SPA_VERSION_USERSPACE		SPA_VERSION_15
 #define	SPA_VERSION_STMF_PROP		SPA_VERSION_16
 #define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
 #define	SPA_VERSION_USERREFS		SPA_VERSION_18
 #define	SPA_VERSION_HOLES		SPA_VERSION_19
 #define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
 #define	SPA_VERSION_DEDUP		SPA_VERSION_21
 #define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
 #define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
 #define	SPA_VERSION_SA			SPA_VERSION_24
 #define	SPA_VERSION_SCAN		SPA_VERSION_25
 #define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
 #define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
 #define	SPA_VERSION_FAST_SNAP		SPA_VERSION_27
 #define	SPA_VERSION_MULTI_REPLACE	SPA_VERSION_28
 #define	SPA_VERSION_BEFORE_FEATURES	SPA_VERSION_28
 #define	SPA_VERSION_FEATURES		SPA_VERSION_5000
 
 /*
  * True when version v lies in one of two closed ranges:
  * [SPA_VERSION_INITIAL, SPA_VERSION_BEFORE_FEATURES] or
  * [SPA_VERSION_FEATURES, SPA_VERSION].  Versions between the two ranges
  * are not supported.  Note that v is evaluated more than once, so it must
  * not have side effects.
  */
 #define	SPA_VERSION_IS_SUPPORTED(v) \
 	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
 	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
  * occurs.  This is independent of SPA/DMU/ZAP versioning.  You must
  * also update the version_table[] and help message in zfs_prop.c.
  *
  * When changing, be sure to teach GRUB how to read the new format!
  * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
  */
 #define	ZPL_VERSION_1			1ULL
 #define	ZPL_VERSION_2			2ULL
 #define	ZPL_VERSION_3			3ULL
 #define	ZPL_VERSION_4			4ULL
 #define	ZPL_VERSION_5			5ULL
 #define	ZPL_VERSION			ZPL_VERSION_5
 #define	ZPL_VERSION_STRING		"5"
 
 #define	ZPL_VERSION_INITIAL		ZPL_VERSION_1
 #define	ZPL_VERSION_DIRENT_TYPE		ZPL_VERSION_2
 #define	ZPL_VERSION_FUID		ZPL_VERSION_3
 #define	ZPL_VERSION_NORMALIZATION	ZPL_VERSION_3
 #define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
 #define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
 #define	ZPL_VERSION_SA			ZPL_VERSION_5
 
 /*
  * Rewind request information.
  *
  * Each request is a distinct bit.  ZPOOL_REWIND_MASK (28) is the sum of the
  * TRY, DO and EXTREME bits (4 + 8 + 16); ZPOOL_REWIND_POLICIES (31) is the
  * sum of all five request bits.
  */
 #define	ZPOOL_NO_REWIND		1  /* No policy - default behavior */
 #define	ZPOOL_NEVER_REWIND	2  /* Do not search for best txg or rewind */
 #define	ZPOOL_TRY_REWIND	4  /* Search for best txg, but do not rewind */
 #define	ZPOOL_DO_REWIND		8  /* Rewind to best txg w/in deferred frees */
 #define	ZPOOL_EXTREME_REWIND	16 /* Allow extreme measures to find best txg */
 #define	ZPOOL_REWIND_MASK	28 /* All the possible rewind bits */
 #define	ZPOOL_REWIND_POLICIES	31 /* All the possible policy bits */
 
 typedef struct zpool_rewind_policy {
 	uint32_t	zrp_request;	/* rewind behavior requested */
 	uint64_t	zrp_maxmeta;	/* max acceptable meta-data errors */
 	uint64_t	zrp_maxdata;	/* max acceptable data errors */
 	uint64_t	zrp_txg;	/* specific txg to load */
 } zpool_rewind_policy_t;
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
  * configuration.
  */
 #define	ZPOOL_CONFIG_VERSION		"version"
 #define	ZPOOL_CONFIG_POOL_NAME		"name"
 #define	ZPOOL_CONFIG_POOL_STATE		"state"
 #define	ZPOOL_CONFIG_POOL_TXG		"txg"
 #define	ZPOOL_CONFIG_POOL_GUID		"pool_guid"
 #define	ZPOOL_CONFIG_CREATE_TXG		"create_txg"
 #define	ZPOOL_CONFIG_TOP_GUID		"top_guid"
 #define	ZPOOL_CONFIG_VDEV_TREE		"vdev_tree"
 #define	ZPOOL_CONFIG_TYPE		"type"
 #define	ZPOOL_CONFIG_CHILDREN		"children"
 #define	ZPOOL_CONFIG_ID			"id"
 #define	ZPOOL_CONFIG_GUID		"guid"
 #define	ZPOOL_CONFIG_PATH		"path"
 #define	ZPOOL_CONFIG_DEVID		"devid"
 #define	ZPOOL_CONFIG_METASLAB_ARRAY	"metaslab_array"
 #define	ZPOOL_CONFIG_METASLAB_SHIFT	"metaslab_shift"
 #define	ZPOOL_CONFIG_ASHIFT		"ashift"
 #define	ZPOOL_CONFIG_ASIZE		"asize"
 #define	ZPOOL_CONFIG_DTL		"DTL"
 #define	ZPOOL_CONFIG_SCAN_STATS		"scan_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_VDEV_STATS		"vdev_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
 #define	ZPOOL_CONFIG_SPARES		"spares"
 #define	ZPOOL_CONFIG_IS_SPARE		"is_spare"
 #define	ZPOOL_CONFIG_NPARITY		"nparity"
 #define	ZPOOL_CONFIG_HOSTID		"hostid"
 #define	ZPOOL_CONFIG_HOSTNAME		"hostname"
 #define	ZPOOL_CONFIG_LOADED_TIME	"initial_load_time"
 #define	ZPOOL_CONFIG_UNSPARE		"unspare"
 #define	ZPOOL_CONFIG_PHYS_PATH		"phys_path"
 #define	ZPOOL_CONFIG_IS_LOG		"is_log"
 #define	ZPOOL_CONFIG_L2CACHE		"l2cache"
 #define	ZPOOL_CONFIG_HOLE_ARRAY		"hole_array"
 #define	ZPOOL_CONFIG_VDEV_CHILDREN	"vdev_children"
 #define	ZPOOL_CONFIG_IS_HOLE		"is_hole"
 #define	ZPOOL_CONFIG_DDT_HISTOGRAM	"ddt_histogram"
 #define	ZPOOL_CONFIG_DDT_OBJ_STATS	"ddt_object_stats"
 #define	ZPOOL_CONFIG_DDT_STATS		"ddt_stats"
 #define	ZPOOL_CONFIG_SPLIT		"splitcfg"
 #define	ZPOOL_CONFIG_ORIG_GUID		"orig_guid"
 #define	ZPOOL_CONFIG_SPLIT_GUID		"split_guid"
 #define	ZPOOL_CONFIG_SPLIT_LIST		"guid_list"
 #define	ZPOOL_CONFIG_REMOVING		"removing"
 #define	ZPOOL_CONFIG_RESILVER_TXG	"resilver_txg"
 #define	ZPOOL_CONFIG_COMMENT		"comment"
 #define	ZPOOL_CONFIG_SUSPENDED		"suspended"	/* not stored on disk */
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp"	/* not stored on disk */
 #define	ZPOOL_CONFIG_BOOTFS		"bootfs"	/* not stored on disk */
 #define	ZPOOL_CONFIG_MISSING_DEVICES	"missing_vdevs"	/* not stored on disk */
 #define	ZPOOL_CONFIG_LOAD_INFO		"load_info"	/* not stored on disk */
 #define	ZPOOL_CONFIG_REWIND_INFO	"rewind_info"	/* not stored on disk */
 #define	ZPOOL_CONFIG_UNSUP_FEAT		"unsup_feat"	/* not stored on disk */
 #define	ZPOOL_CONFIG_ENABLED_FEAT	"enabled_feat"	/* not stored on disk */
 #define	ZPOOL_CONFIG_CAN_RDONLY		"can_rdonly"	/* not stored on disk */
 #define	ZPOOL_CONFIG_FEATURES_FOR_READ	"features_for_read"
 #define	ZPOOL_CONFIG_FEATURE_STATS	"feature_stats"	/* not stored on disk */
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
  * as offline and degraded.
  */
 #define	ZPOOL_CONFIG_OFFLINE		"offline"
 #define	ZPOOL_CONFIG_FAULTED		"faulted"
 #define	ZPOOL_CONFIG_DEGRADED		"degraded"
 #define	ZPOOL_CONFIG_REMOVED		"removed"
 #define	ZPOOL_CONFIG_FRU		"fru"
 #define	ZPOOL_CONFIG_AUX_STATE		"aux_state"
 
 /* Rewind policy parameters */
 #define	ZPOOL_REWIND_POLICY		"rewind-policy"
 #define	ZPOOL_REWIND_REQUEST		"rewind-request"
 #define	ZPOOL_REWIND_REQUEST_TXG	"rewind-request-txg"
 #define	ZPOOL_REWIND_META_THRESH	"rewind-meta-thresh"
 #define	ZPOOL_REWIND_DATA_THRESH	"rewind-data-thresh"
 
 /* Rewind data discovered */
 #define	ZPOOL_CONFIG_LOAD_TIME		"rewind_txg_ts"
 #define	ZPOOL_CONFIG_LOAD_DATA_ERRORS	"verify_data_errors"
 #define	ZPOOL_CONFIG_REWIND_TIME	"seconds_of_rewind"
 
 #define	VDEV_TYPE_ROOT			"root"
 #define	VDEV_TYPE_MIRROR		"mirror"
 #define	VDEV_TYPE_REPLACING		"replacing"
 #define	VDEV_TYPE_RAIDZ			"raidz"
 #define	VDEV_TYPE_DISK			"disk"
 #define	VDEV_TYPE_FILE			"file"
 #define	VDEV_TYPE_MISSING		"missing"
 #define	VDEV_TYPE_HOLE			"hole"
 #define	VDEV_TYPE_SPARE			"spare"
 #define	VDEV_TYPE_LOG			"log"
 #define	VDEV_TYPE_L2CACHE		"l2cache"
 
 /*
  * This is needed in userland to report the minimum necessary device size.
  */
 #define	SPA_MINDEVSIZE		(64ULL << 20)
 
 /*
  * Set if the fragmentation has not yet been calculated. This can happen
  * because the space maps have not been upgraded or the histogram feature
  * is not enabled.
  */
 #define	ZFS_FRAG_INVALID	UINT64_MAX
 
 /*
  * The location of the pool configuration repository, shared between kernel and
  * userland.
  */
 #define	ZPOOL_CACHE		"/boot/zfs/zpool.cache"
 
 /*
  * vdev states are ordered from least to most healthy.
  * A vdev that's CANT_OPEN or below is considered unusable.
  */
 typedef enum vdev_state {
 	VDEV_STATE_UNKNOWN = 0,	/* Uninitialized vdev			*/
 	VDEV_STATE_CLOSED,	/* Not currently open			*/
 	VDEV_STATE_OFFLINE,	/* Not allowed to open			*/
 	VDEV_STATE_REMOVED,	/* Explicitly removed from system	*/
 	VDEV_STATE_CANT_OPEN,	/* Tried to open, but failed		*/
 	VDEV_STATE_FAULTED,	/* External request to fault device	*/
 	VDEV_STATE_DEGRADED,	/* Replicated vdev with unhealthy kids	*/
 	VDEV_STATE_HEALTHY	/* Presumed good			*/
 } vdev_state_t;
 
 #define	VDEV_STATE_ONLINE	VDEV_STATE_HEALTHY
 
 /*
  * vdev aux states.  When a vdev is in the CANT_OPEN state, the aux field
  * of the vdev stats structure uses these constants to distinguish why.
  */
 typedef enum vdev_aux {
 	VDEV_AUX_NONE,		/* no error				*/
 	VDEV_AUX_OPEN_FAILED,	/* ldi_open_*() or vn_open() failed	*/
 	VDEV_AUX_CORRUPT_DATA,	/* bad label or disk contents		*/
 	VDEV_AUX_NO_REPLICAS,	/* insufficient number of replicas	*/
 	VDEV_AUX_BAD_GUID_SUM,	/* vdev guid sum doesn't match		*/
 	VDEV_AUX_TOO_SMALL,	/* vdev size is too small		*/
 	VDEV_AUX_BAD_LABEL,	/* the label is OK but invalid		*/
 	VDEV_AUX_VERSION_NEWER,	/* on-disk version is too new		*/
 	VDEV_AUX_VERSION_OLDER,	/* on-disk version is too old		*/
 	VDEV_AUX_UNSUP_FEAT,	/* unsupported features			*/
 	VDEV_AUX_SPARED,	/* hot spare used in another pool	*/
 	VDEV_AUX_ERR_EXCEEDED,	/* too many errors			*/
 	VDEV_AUX_IO_FAILURE,	/* experienced I/O failure		*/
 	VDEV_AUX_BAD_LOG,	/* cannot read log chain(s)		*/
 	VDEV_AUX_EXTERNAL,	/* external diagnosis			*/
 	VDEV_AUX_SPLIT_POOL,	/* vdev was split off into another pool	*/
 	VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large   */
 } vdev_aux_t;
 
 /*
  * pool state.  The following states are written to disk as part of the normal
  * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE.  The remaining
  * states are software abstractions used at various levels to communicate
  * pool state.
  */
 typedef enum pool_state {
 	POOL_STATE_ACTIVE = 0,		/* In active use		*/
 	POOL_STATE_EXPORTED,		/* Explicitly exported		*/
 	POOL_STATE_DESTROYED,		/* Explicitly destroyed		*/
 	POOL_STATE_SPARE,		/* Reserved for hot spare use	*/
 	POOL_STATE_L2CACHE,		/* Level 2 ARC device		*/
 	POOL_STATE_UNINITIALIZED,	/* Internal spa_t state		*/
 	POOL_STATE_UNAVAIL,		/* Internal libzfs state	*/
 	POOL_STATE_POTENTIALLY_ACTIVE	/* Internal libzfs state	*/
 } pool_state_t;
 
 /*
  * Scan Functions.
  */
 typedef enum pool_scan_func {
 	POOL_SCAN_NONE,
 	POOL_SCAN_SCRUB,
 	POOL_SCAN_RESILVER,
 	POOL_SCAN_FUNCS
 } pool_scan_func_t;
 
 /*
  * ZIO types.  Needed to interpret vdev statistics below.
  */
 typedef enum zio_type {
 	ZIO_TYPE_NULL = 0,
 	ZIO_TYPE_READ,
 	ZIO_TYPE_WRITE,
 	ZIO_TYPE_FREE,
 	ZIO_TYPE_CLAIM,
 	ZIO_TYPE_IOCTL,
 	ZIO_TYPES
 } zio_type_t;
 
 /*
  * Pool statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  */
 typedef struct pool_scan_stat {
 	/* values stored on disk */
 	uint64_t	pss_func;	/* pool_scan_func_t */
 	uint64_t	pss_state;	/* dsl_scan_state_t */
 	uint64_t	pss_start_time;	/* scan start time */
 	uint64_t	pss_end_time;	/* scan end time */
 	uint64_t	pss_to_examine;	/* total bytes to scan */
 	uint64_t	pss_examined;	/* total examined bytes	*/
 	uint64_t	pss_to_process; /* total bytes to process */
 	uint64_t	pss_processed;	/* total processed bytes */
 	uint64_t	pss_errors;	/* scan errors	*/
 
 	/* values not stored on disk */
 	uint64_t	pss_pass_exam;	/* examined bytes per scan pass */
 	uint64_t	pss_pass_start;	/* start time of a scan pass */
 } pool_scan_stat_t;
 
 typedef enum dsl_scan_state {
 	DSS_NONE,
 	DSS_SCANNING,
 	DSS_FINISHED,
 	DSS_CANCELED,
 	DSS_NUM_STATES
 } dsl_scan_state_t;
 
 
 /*
  * Vdev statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  *
  * VDEV_STAT_VALID() tests whether a field lies within the first N uint64_t
  * slots of this structure, so the order of the fields matters.
  */
 typedef struct vdev_stat {
 	hrtime_t	vs_timestamp;		/* time since vdev load	*/
 	uint64_t	vs_state;		/* vdev state		*/
 	uint64_t	vs_aux;			/* see vdev_aux_t	*/
 	uint64_t	vs_alloc;		/* space allocated	*/
 	uint64_t	vs_space;		/* total capacity	*/
 	uint64_t	vs_dspace;		/* deflated capacity	*/
 	uint64_t	vs_rsize;		/* replaceable dev size */
 	uint64_t	vs_esize;		/* expandable dev size */
 	uint64_t	vs_ops[ZIO_TYPES];	/* operation count	*/
 	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written	*/
 	uint64_t	vs_read_errors;		/* read errors		*/
 	uint64_t	vs_write_errors;	/* write errors		*/
 	uint64_t	vs_checksum_errors;	/* checksum errors	*/
 	uint64_t	vs_self_healed;		/* self-healed bytes	*/
 	uint64_t	vs_scan_removing;	/* removing?	*/
 	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
 	uint64_t	vs_configured_ashift;	/* TLV vdev_ashift	*/
 	uint64_t	vs_logical_ashift;	/* vdev_logical_ashift	*/
 	uint64_t	vs_physical_ashift;	/* vdev_physical_ashift	*/
 	uint64_t	vs_fragmentation;	/* device fragmentation */
 } vdev_stat_t;
 /*
  * True when 'field' of vdev_stat_t lies entirely within the first
  * uint64_t_field_count uint64_t slots of the structure.  The count argument
  * is parenthesized so that expressions such as "a + b" expand correctly.
  */
 #define VDEV_STAT_VALID(field, uint64_t_field_count) \
     (((uint64_t_field_count) * sizeof(uint64_t)) >= \
      (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
 
 /*
  * DDT statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  */
 typedef struct ddt_object {
 	uint64_t	ddo_count;	/* number of elements in ddt 	*/
 	uint64_t	ddo_dspace;	/* size of ddt on disk		*/
 	uint64_t	ddo_mspace;	/* size of ddt in-core		*/
 } ddt_object_t;
 
 /*
  * One set of DDT block/size counters.  The first four members are plain
  * totals; the dds_ref_* members are the corresponding totals for
  * referenced blocks, with the sizes multiplied by the reference count.
  * ddt_histogram_t stores an array of these, one per bucket.
  */
 typedef struct ddt_stat {
 	uint64_t	dds_blocks;	/* blocks			*/
 	uint64_t	dds_lsize;	/* logical size			*/
 	uint64_t	dds_psize;	/* physical size		*/
 	uint64_t	dds_dsize;	/* deflated allocated size	*/
 	uint64_t	dds_ref_blocks;	/* referenced blocks		*/
 	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt	*/
 	uint64_t	dds_ref_psize;	/* referenced psize * refcnt	*/
 	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt	*/
 } ddt_stat_t;
 
 /*
  * Histogram of ddt_stat_t counters with a fixed 64 buckets.
  * NOTE(review): 64 presumably corresponds to the bit width of a 64-bit
  * reference count (one bucket per power of two) -- confirm.
  */
 typedef struct ddt_histogram {
 	ddt_stat_t	ddh_stat[64];	/* power-of-two histogram buckets */
 } ddt_histogram_t;
 
 #define	ZVOL_DRIVER	"zvol"
 #define	ZFS_DRIVER	"zfs"
 #define	ZFS_DEV_NAME	"zfs"
 #define	ZFS_DEV		"/dev/" ZFS_DEV_NAME
 
 /* general zvol path */
 #define	ZVOL_DIR		"/dev/zvol"
 /* expansion */
 #define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zfs@0:"
 /* for dump and swap */
 #define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
 #define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"
 
 #define	ZVOL_PROP_NAME		"name"
 #define	ZVOL_DEFAULT_BLOCKSIZE	8192
 
 /*
  * /dev/zfs ioctl numbers.
  *
  * Values are assigned implicitly, counting up from ZFS_IOC_FIRST (0), so
  * each command's number is fixed by its position in this list: inserting,
  * removing or reordering an entry renumbers every command after it.
  * ZFS_IOC_POOL_CREATE has the same value as ZFS_IOC_FIRST, and
  * ZFS_IOC_LAST is one greater than the final command listed
  * (ZFS_IOC_DESTROY_BOOKMARKS).
  * NOTE(review): ZFS_IOC_LAST is presumably an end marker rather than a
  * command of its own -- confirm.
  */
 typedef enum zfs_ioc {
 	ZFS_IOC_FIRST =	0,
 	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
 	ZFS_IOC_POOL_DESTROY,
 	ZFS_IOC_POOL_IMPORT,
 	ZFS_IOC_POOL_EXPORT,
 	ZFS_IOC_POOL_CONFIGS,
 	ZFS_IOC_POOL_STATS,
 	ZFS_IOC_POOL_TRYIMPORT,
 	ZFS_IOC_POOL_SCAN,
 	ZFS_IOC_POOL_FREEZE,
 	ZFS_IOC_POOL_UPGRADE,
 	ZFS_IOC_POOL_GET_HISTORY,
 	ZFS_IOC_VDEV_ADD,
 	ZFS_IOC_VDEV_REMOVE,
 	ZFS_IOC_VDEV_SET_STATE,
 	ZFS_IOC_VDEV_ATTACH,
 	ZFS_IOC_VDEV_DETACH,
 	ZFS_IOC_VDEV_SETPATH,
 	ZFS_IOC_VDEV_SETFRU,
 	ZFS_IOC_OBJSET_STATS,
 	ZFS_IOC_OBJSET_ZPLPROPS,
 	ZFS_IOC_DATASET_LIST_NEXT,
 	ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	ZFS_IOC_SET_PROP,
 	ZFS_IOC_CREATE,
 	ZFS_IOC_DESTROY,
 	ZFS_IOC_ROLLBACK,
 	ZFS_IOC_RENAME,
 	ZFS_IOC_RECV,
 	ZFS_IOC_SEND,
 	ZFS_IOC_INJECT_FAULT,
 	ZFS_IOC_CLEAR_FAULT,
 	ZFS_IOC_INJECT_LIST_NEXT,
 	ZFS_IOC_ERROR_LOG,
 	ZFS_IOC_CLEAR,
 	ZFS_IOC_PROMOTE,
 	ZFS_IOC_DESTROY_SNAPS,
 	ZFS_IOC_SNAPSHOT,
 	ZFS_IOC_DSOBJ_TO_DSNAME,
 	ZFS_IOC_OBJ_TO_PATH,
 	ZFS_IOC_POOL_SET_PROPS,
 	ZFS_IOC_POOL_GET_PROPS,
 	ZFS_IOC_SET_FSACL,
 	ZFS_IOC_GET_FSACL,
 	ZFS_IOC_SHARE,
 	ZFS_IOC_INHERIT_PROP,
 	ZFS_IOC_SMB_ACL,
 	ZFS_IOC_USERSPACE_ONE,
 	ZFS_IOC_USERSPACE_MANY,
 	ZFS_IOC_USERSPACE_UPGRADE,
 	ZFS_IOC_HOLD,
 	ZFS_IOC_RELEASE,
 	ZFS_IOC_GET_HOLDS,
 	ZFS_IOC_OBJSET_RECVD_PROPS,
 	ZFS_IOC_VDEV_SPLIT,
 	ZFS_IOC_NEXT_OBJ,
 	ZFS_IOC_DIFF,
 	ZFS_IOC_TMP_SNAPSHOT,
 	ZFS_IOC_OBJ_TO_STATS,
 	ZFS_IOC_JAIL,
 	ZFS_IOC_UNJAIL,
 	ZFS_IOC_POOL_REGUID,
 	ZFS_IOC_SPACE_WRITTEN,
 	ZFS_IOC_SPACE_SNAPS,
 	ZFS_IOC_SEND_PROGRESS,
 	ZFS_IOC_POOL_REOPEN,
 	ZFS_IOC_LOG_HISTORY,
 	ZFS_IOC_SEND_NEW,
 	ZFS_IOC_SEND_SPACE,
 	ZFS_IOC_CLONE,
 	ZFS_IOC_BOOKMARK,
 	ZFS_IOC_GET_BOOKMARKS,
 	ZFS_IOC_DESTROY_BOOKMARKS,
 	ZFS_IOC_LAST
 } zfs_ioc_t;
 
 /*
  * Internal SPA load state.  Used by FMA diagnosis engine.
  *
  * The enumerators take the implicit values 0 (SPA_LOAD_NONE) through
  * 5 (SPA_LOAD_ERROR) in the order listed, so reordering them changes
  * their numeric values.
  */
 typedef enum {
 	SPA_LOAD_NONE,		/* no load in progress	*/
 	SPA_LOAD_OPEN,		/* normal open		*/
 	SPA_LOAD_IMPORT,	/* import in progress	*/
 	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
 	SPA_LOAD_RECOVER,	/* recovery requested	*/
 	SPA_LOAD_ERROR		/* load failed		*/
 } spa_load_state_t;
 
 /*
  * Bookmark name values.
  */
 #define	ZPOOL_ERR_LIST		"error list"
 #define	ZPOOL_ERR_DATASET	"dataset"
 #define	ZPOOL_ERR_OBJECT	"object"
 
 /*
  * Twice MAXPATHLEN plus one.
  * NOTE(review): going by the name, presumably the maximum length of one
  * history record; what the extra 1 accounts for is not shown here --
  * confirm against the code that sizes history buffers.
  */
 #define	HIS_MAX_RECORD_LEN	(MAXPATHLEN + MAXPATHLEN + 1)
 
 /*
  * The following are names used in the nvlist describing
  * the pool's history log.
  *
  * These strings are the literal nvlist names, so changing any of them
  * changes what is written to and looked up in that nvlist.  Note the two
  * naming styles: the entries from ZPOOL_HIST_RECORD through
  * ZPOOL_HIST_INT_STR are space-separated "history ..." phrases, while
  * those from ZPOOL_HIST_INT_NAME onward are short identifier-style names.
  */
 #define	ZPOOL_HIST_RECORD	"history record"
 #define	ZPOOL_HIST_TIME		"history time"
 #define	ZPOOL_HIST_CMD		"history command"
 #define	ZPOOL_HIST_WHO		"history who"
 #define	ZPOOL_HIST_ZONE		"history zone"
 #define	ZPOOL_HIST_HOST		"history hostname"
 #define	ZPOOL_HIST_TXG		"history txg"
 #define	ZPOOL_HIST_INT_EVENT	"history internal event"
 #define	ZPOOL_HIST_INT_STR	"history internal str"
 #define	ZPOOL_HIST_INT_NAME	"internal_name"
 #define	ZPOOL_HIST_IOCTL	"ioctl"
 #define	ZPOOL_HIST_INPUT_NVL	"in_nvl"
 #define	ZPOOL_HIST_OUTPUT_NVL	"out_nvl"
 #define	ZPOOL_HIST_DSNAME	"dsname"
 #define	ZPOOL_HIST_DSID		"dsid"
 
 /*
  * Flag bits for ZFS_IOC_VDEV_SET_STATE.
  *
  * Each ZFS_ONLINE_* flag occupies its own bit (bits 0 through 3).
  * ZFS_OFFLINE_TEMPORARY is a separate ZFS_OFFLINE_* flag that also uses
  * bit 0, so it has the same numeric value as ZFS_ONLINE_CHECKREMOVE.
  */
 #define	ZFS_ONLINE_CHECKREMOVE	(1 << 0)
 #define	ZFS_ONLINE_UNSPARE	(1 << 1)
 #define	ZFS_ONLINE_FORCEFAULT	(1 << 2)
 #define	ZFS_ONLINE_EXPAND	(1 << 3)
 
 #define	ZFS_OFFLINE_TEMPORARY	(1 << 0)
 
 /*
  * Flag bits for ZFS_IOC_POOL_IMPORT.
  *
  * ZFS_IMPORT_NORMAL is the empty flag set (no bits); each of the other
  * flags occupies its own bit (bits 0 through 3).
  */
 #define	ZFS_IMPORT_NORMAL	0
 #define	ZFS_IMPORT_VERBATIM	(1 << 0)
 #define	ZFS_IMPORT_ANY_HOST	(1 << 1)
 #define	ZFS_IMPORT_MISSING_LOG	(1 << 2)
 #define	ZFS_IMPORT_ONLY		(1 << 3)
 
 /*
  * Sysevent payload members.  ZFS will generate the following sysevents with the
  * given payloads:
  *
  *	ESC_ZFS_RESILVER_START
  *	ESC_ZFS_RESILVER_END
  *	ESC_ZFS_POOL_DESTROY
  *	ESC_ZFS_POOL_REGUID
  *
  *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
  *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
  *
  *	ESC_ZFS_VDEV_REMOVE
  *	ESC_ZFS_VDEV_CLEAR
  *	ESC_ZFS_VDEV_CHECK
  *
  *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
  *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
  *		ZFS_EV_VDEV_PATH	DATA_TYPE_STRING	(optional)
  *		ZFS_EV_VDEV_GUID	DATA_TYPE_UINT64
  */
 #define	ZFS_EV_POOL_NAME	"pool_name"	/* DATA_TYPE_STRING */
 #define	ZFS_EV_POOL_GUID	"pool_guid"	/* DATA_TYPE_UINT64 */
 #define	ZFS_EV_VDEV_PATH	"vdev_path"	/* DATA_TYPE_STRING, optional */
 #define	ZFS_EV_VDEV_GUID	"vdev_guid"	/* DATA_TYPE_UINT64 */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_FS_ZFS_H */
Index: head/sys/cddl/contrib/opensolaris
===================================================================
--- head/sys/cddl/contrib/opensolaris	(revision 296518)
+++ head/sys/cddl/contrib/opensolaris	(revision 296519)

Property changes on: head/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor-sys/illumos/dist:r296518